In [5]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import missingno as msno 
import os

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter
from wordcloud import WordCloud 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score, recall_score
from sklearn.metrics import precision_recall_curve, average_precision_score

from yellowbrick.classifier import PrecisionRecallCurve

import nltk
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

# Importing plotly and cufflinks in offline mode
import cufflinks as cf
import plotly.express as px
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

# Figure&Display options
%matplotlib inline
fig, ax = plt.subplots()
# fig.set_size_inches(10, 6)
plt.rcParams["figure.figsize"] = (12, 8)  # the size of A4 paper use (11.7, 8.27)
pd.set_option('max_colwidth', 200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# !pip install termcolor
import colorama
from colorama import Fore, Style  # maakes strings colored
from termcolor import colored

import ipywidgets
from ipywidgets import interact

# !pip install -U pandas-profiling --user
# !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
import pandas_profiling
from pandas_profiling.report.presentation.flavours.html.templates import create_html_assets
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
---------------------------------------------------------------------------
PydanticImportError                       Traceback (most recent call last)
Cell In[5], line 77
     74 # !pip install -U pandas-profiling --user
     75 # !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
     76 import pandas_profiling
---> 77 from pandas_profiling.report.presentation.flavours.html.templates import create_html_assets

File ~\AppData\Roaming\Python\Python311\site-packages\pandas_profiling\report\__init__.py:2
      1 """All functionality concerned with presentation to the user."""
----> 2 from pandas_profiling.report.structure.report import get_report_structure
      4 __all__ = ["get_report_structure"]

File ~\AppData\Roaming\Python\Python311\site-packages\pandas_profiling\report\structure\report.py:7
      4 import pandas as pd
      5 from tqdm.auto import tqdm
----> 7 from pandas_profiling.config import Settings
      8 from pandas_profiling.model.alerts import AlertType
      9 from pandas_profiling.model.handler import get_render_map

File ~\AppData\Roaming\Python\Python311\site-packages\pandas_profiling\config.py:5
      2 from enum import Enum
      3 from typing import Any, Dict, List, Optional
----> 5 from pydantic import BaseModel, BaseSettings, Field
      8 def _merge_dictionaries(dict1: dict, dict2: dict) -> dict:
      9     """
     10     Recursive merge dictionaries.
     11 
   (...)
     14     :return: Merged dictionary
     15     """

File ~\AppData\Roaming\Python\Python311\site-packages\pydantic\__init__.py:386, in __getattr__(attr_name)
    384 dynamic_attr = _dynamic_imports.get(attr_name)
    385 if dynamic_attr is None:
--> 386     return _getattr_migration(attr_name)
    388 package, module_name = dynamic_attr
    390 from importlib import import_module

File ~\AppData\Roaming\Python\Python311\site-packages\pydantic\_migration.py:296, in getattr_migration.<locals>.wrapper(name)
    294     return import_string(REDIRECT_TO_V1[import_path])
    295 if import_path == 'pydantic:BaseSettings':
--> 296     raise PydanticImportError(
    297         '`BaseSettings` has been moved to the `pydantic-settings` package. '
    298         f'See https://docs.pydantic.dev/{version_short()}/migration/#basesettings-has-moved-to-pydantic-settings '
    299         'for more details.'
    300     )
    301 if import_path in REMOVED_IN_V2:
    302     raise PydanticImportError(f'`{import_path}` has been removed in V2.')

PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.7/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydantic.dev/2.7/u/import-error
In [7]:
def missing_values(df):
    missing_number = df.isnull().sum().sort_values(ascending=False)
    missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_values = pd.concat([missing_number, missing_percent], axis=1, keys=['Missing_Number', 'Missing_Percent'])
    return missing_values[missing_values['Missing_Number']>0]
In [6]:
def first_looking(df):
    print(colored("Shape:", attrs=['bold']), df.shape,'\n', 
          colored('-'*79, 'red', attrs=['bold']),
          colored("\nInfo:\n", attrs=['bold']), sep='')
    print(df.info(), '\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("Number of Uniques:\n", attrs=['bold']), df.nunique(),'\n',
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("Missing Values:\n", attrs=['bold']), missing_values(df),'\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')
    print(colored("All Columns:", attrs=['bold']), list(df.columns),'\n', 
          colored('-'*79, 'red', attrs=['bold']), sep='')

    df.columns= df.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

    print(colored("Columns after rename:", attrs=['bold']), list(df.columns),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
    
        
def multicolinearity_control(df):
    feature =[]
    collinear=[]
    for col in df.corr().columns:
        for i in df.corr().index:
            if (abs(df.corr()[col][i])> .9 and abs(df.corr()[col][i]) < 1):
                    feature.append(col)
                    collinear.append(i)
                    print(colored(f"Multicolinearity alert in between:{col} - {i}", 
                                  "red", attrs=['bold']), df.shape,'\n',
                                  colored('-'*79, 'red', attrs=['bold']), sep='')

def duplicate_values(df):
    print(colored("Duplicate check...", attrs=['bold']), sep='')
    duplicate_values = df.duplicated(subset=None, keep='first').sum()
    if duplicate_values > 0:
        df.drop_duplicates(keep='first', inplace=True)
        print(duplicate_values, colored("Duplicates were dropped!"),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
    else:
        print(colored("There are no duplicates"),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')     
        
def drop_columns(df, drop_columns):
    if drop_columns !=[]:
        df.drop(drop_columns, axis=1, inplace=True)
        print(drop_columns, 'were dropped')
    else:
        print(colored('We will now check the missing values and if necessary will drop realted columns!', attrs=['bold']),'\n',
              colored('-'*79, 'red', attrs=['bold']), sep='')
        
def drop_null(df, limit):
    print('Shape:', df.shape)
    for i in df.isnull().sum().index:
        if (df.isnull().sum()[i]/df.shape[0]*100)>limit:
            print(df.isnull().sum()[i], 'percent of', i ,'null and were dropped')
            df.drop(i, axis=1, inplace=True)
            print('new shape:', df.shape)       
    print('New shape after missing value control:', df.shape)
    
###############################################################################

# To view summary information about the column

def first_look(col):
    print("column name    : ", col)
    print("--------------------------------")
    print("per_of_nulls   : ", "%", round(df[col].isnull().sum()/df.shape[0]*100, 2))
    print("num_of_nulls   : ", df[col].isnull().sum())
    print("num_of_uniques : ", df[col].nunique())
    print(df[col].value_counts(dropna = False))
    
In [8]:
os.chdir('C:\\Users\\aksha\\OneDrive\\Desktop\\Placement\\Self Project\\NLP')
df0=pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df = df0.copy()
df.head()
Out[8]:
Unnamed: 0 Clothing ID Age Title Review Text Rating Recommended IND Positive Feedback Count Division Name Department Name Class Name
0 0 767 33 NaN Absolutely wonderful - silky and sexy and comfortable 4 1 0 Initmates Intimate Intimates
1 1 1080 34 NaN Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8". i love the length... 5 1 4 General Dresses Dresses
2 2 1077 60 Some major design flaws I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i co... 3 0 0 General Dresses Dresses
3 3 1049 50 My favorite buy! I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments! 5 1 0 General Petite Bottoms Pants
4 4 847 47 Flattering shirt This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!! 5 1 6 General Tops Blouses
In [9]:
df.profile_report()
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[9]:

In [10]:
first_looking(df)
Shape:(23486, 11)
-------------------------------------------------------------------------------
Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB
None
-------------------------------------------------------------------------------
Number of Uniques:
Unnamed: 0                 23486
Clothing ID                 1206
Age                           77
Title                      13993
Review Text                22634
Rating                         5
Recommended IND                2
Positive Feedback Count       82
Division Name                  3
Department Name                6
Class Name                    20
dtype: int64
-------------------------------------------------------------------------------
Missing Values:
                 Missing_Number  Missing_Percent
Title                      3810             0.16
Review Text                 845             0.04
Division Name                14             0.00
Department Name              14             0.00
Class Name                   14             0.00
-------------------------------------------------------------------------------
All Columns:['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name']
-------------------------------------------------------------------------------
Columns after rename:['unnamed:_0', 'clothing_id', 'age', 'title', 'review_text', 'rating', 'recommended_ind', 'positive_feedback_count', 'division_name', 'department_name', 'class_name']
-------------------------------------------------------------------------------
In [11]:
df.head(1)
Out[11]:
unnamed:_0 clothing_id age title review_text rating recommended_ind positive_feedback_count division_name department_name class_name
0 0 767 33 NaN Absolutely wonderful - silky and sexy and comfortable 4 1 0 Initmates Intimate Intimates
In [12]:
df.sample(3)
Out[12]:
unnamed:_0 clothing_id age title review_text rating recommended_ind positive_feedback_count division_name department_name class_name
8339 8339 1098 42 NaN NaN 5 1 0 General Dresses Dresses
1615 1615 825 45 Perfect Received this shirt yesterday and loved it. doesn't look that great online in pics, but it is a unique and beautiful top. the material is a substantial woven cotton so it lays nicely and keeps its... 5 1 2 General Petite Tops Blouses
7289 7289 862 28 Date night! I'm always weary of v necks but this shirt is great. the fit is true to size and while it is a v neck, it's not such a deep plunge that you need a cami underneath. great material too and perfect f... 5 1 0 General Tops Knits
In [13]:
df.shape
Out[13]:
(23486, 11)
In [14]:
df.drop("unnamed:_0", axis=1, inplace=True)
df.head(1)
Out[14]:
clothing_id age title review_text rating recommended_ind positive_feedback_count division_name department_name class_name
0 767 33 NaN Absolutely wonderful - silky and sexy and comfortable 4 1 0 Initmates Intimate Intimates
In [15]:
df.describe().T.style.background_gradient(subset=['mean','std','50%','count'], cmap='RdPu').format('{:.2f}')
Out[15]:
  count mean std min 25% 50% 75% max
clothing_id 23486.00 918.12 203.30 0.00 861.00 936.00 1078.00 1205.00
age 23486.00 43.20 12.28 18.00 34.00 41.00 52.00 99.00
rating 23486.00 4.20 1.11 1.00 4.00 5.00 5.00 5.00
recommended_ind 23486.00 0.82 0.38 0.00 1.00 1.00 1.00 1.00
positive_feedback_count 23486.00 2.54 5.70 0.00 0.00 1.00 3.00 122.00
In [16]:
df.describe(include=object).T
Out[16]:
count unique top freq
title 19676 13993 Love it! 136
review_text 22641 22634 Perfect fit and i've gotten so many compliments. i buy all my suits from here now! 3
division_name 23472 3 General 13850
department_name 23472 6 Tops 10468
class_name 23472 20 Dresses 6319
In [17]:
# to find how many unique values numerical features have

for col in df.select_dtypes(include=[np.number]).columns:
  print(colored(f"{col}", 'green', attrs=['bold']), f"feature has", colored(f"{df[col].nunique()}", 'green', attrs=['bold']), f"unique values.")
clothing_id feature has 1206 unique values.
age feature has 77 unique values.
rating feature has 5 unique values.
recommended_ind feature has 2 unique values.
positive_feedback_count feature has 82 unique values.
In [18]:
# to find how many unique values object features have

for col in df.select_dtypes(include="object").columns:
  print(colored(f"{col}", 'green', attrs=['bold']), f"feature has", colored(f"{df[col].nunique()}", 'green', attrs=['bold']), f"unique values.")
title feature has 13993 unique values.
review_text feature has 22634 unique values.
division_name feature has 3 unique values.
department_name feature has 6 unique values.
class_name feature has 20 unique values.
In [20]:
df.columns
Out[20]:
Index(['clothing_id', 'age', 'title', 'review_text', 'rating',
       'recommended_ind', 'positive_feedback_count', 'division_name',
       'department_name', 'class_name'],
      dtype='object')
In [21]:
df.head(2)
Out[21]:
clothing_id age title review_text rating recommended_ind positive_feedback_count division_name department_name class_name
0 767 33 NaN Absolutely wonderful - silky and sexy and comfortable 4 1 0 Initmates Intimate Intimates
1 1080 34 NaN Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8". i love the length... 5 1 4 General Dresses Dresses
In [22]:
df["recommended_ind"].value_counts()
Out[22]:
recommended_ind
1    19314
0     4172
Name: count, dtype: int64
In [23]:
df["recommended_ind"].value_counts()
Out[23]:
recommended_ind
1    19314
0     4172
Name: count, dtype: int64
In [24]:
first_look("recommended_ind")
column name    :  recommended_ind
--------------------------------
per_of_nulls   :  % 0.0
num_of_nulls   :  0
num_of_uniques :  2
recommended_ind
1    19314
0     4172
Name: count, dtype: int64
In [25]:
df["recommended_ind"].describe().T
Out[25]:
count   23486.00
mean        0.82
std         0.38
min         0.00
25%         1.00
50%         1.00
75%         1.00
max         1.00
Name: recommended_ind, dtype: float64
In [26]:
sns.countplot(x = df.recommended_ind, data = df)
plt.title('Customer Recommendation Distribution', fontsize=30)
plt.xlabel("Recommendation Label", fontsize=24)
plt.ylabel("The Number of Recommendations", fontsize=24)

for index,value in enumerate(df.recommended_ind.value_counts().sort_values()):
     plt.text(index, value, f"{value}", ha="center", va="bottom", fontsize = 13);
In [27]:
plt.figure(figsize=(8, 8))

explode = [0, 0.1]
plt.pie(df['recommended_ind'].value_counts(), explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.legend(labels=['1', '0'])
plt.title('Customer Recommendation Distribution', fontsize=20)
plt.axis('off');
In [28]:
df["rating"].value_counts()
Out[28]:
rating
5    13131
4     5077
3     2871
2     1565
1      842
Name: count, dtype: int64
In [29]:
first_look("rating")
column name    :  rating
--------------------------------
per_of_nulls   :  % 0.0
num_of_nulls   :  0
num_of_uniques :  5
rating
5    13131
4     5077
3     2871
2     1565
1      842
Name: count, dtype: int64
In [30]:
df["rating"].describe().T
Out[30]:
count   23486.00
mean        4.20
std         1.11
min         1.00
25%         4.00
50%         5.00
75%         5.00
max         5.00
Name: rating, dtype: float64
In [31]:
sns.countplot(x = df.rating, data = df)
plt.title('Customer Rating Distribution', fontsize=30)
plt.xlabel("Rating Label", fontsize=24)
plt.ylabel("The Number of Rating", fontsize=24)

for index,value in enumerate(df.rating.value_counts().sort_values()):
     plt.text(index, value, f"{value}", ha="center", va="bottom", fontsize = 13);
In [32]:
plt.figure(figsize=(8, 8))

explode = [0.1, 0, 0, 0, 0.1]
plt.pie(df['rating'].value_counts(), explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.legend(labels=['1', '2', '3', '4','5'])
plt.title('Customer Rating Distribution', fontsize=20)
plt.axis('off');
In [33]:
df["age"].value_counts()
Out[33]:
age
39    1269
35     909
36     842
34     804
38     780
37     766
41     741
33     725
46     713
42     651
32     631
48     626
40     617
44     617
43     579
31     569
47     564
53     560
45     529
29     513
49     490
56     471
52     442
28     428
26     423
30     407
50     398
54     395
51     393
57     363
27     344
60     341
62     338
25     331
55     328
59     321
58     277
66     276
64     247
23     247
24     245
63     239
61     227
65     226
67     157
22     146
68     141
69     113
20     108
21     102
70      93
71      51
74      50
72      46
83      43
19      40
73      40
75      26
77      18
78      15
79      15
82      13
76      10
80      10
85       6
84       6
89       5
81       5
91       5
18       4
87       4
94       3
93       2
90       2
86       2
99       2
92       1
Name: count, dtype: int64
In [34]:
first_look("age")
column name    :  age
--------------------------------
per_of_nulls   :  % 0.0
num_of_nulls   :  0
num_of_uniques :  77
age
39    1269
35     909
36     842
34     804
38     780
37     766
41     741
33     725
46     713
42     651
32     631
48     626
40     617
44     617
43     579
31     569
47     564
53     560
45     529
29     513
49     490
56     471
52     442
28     428
26     423
30     407
50     398
54     395
51     393
57     363
27     344
60     341
62     338
25     331
55     328
59     321
58     277
66     276
64     247
23     247
24     245
63     239
61     227
65     226
67     157
22     146
68     141
69     113
20     108
21     102
70      93
71      51
74      50
72      46
83      43
19      40
73      40
75      26
77      18
78      15
79      15
82      13
76      10
80      10
85       6
84       6
89       5
81       5
91       5
18       4
87       4
94       3
93       2
90       2
86       2
99       2
92       1
Name: count, dtype: int64
In [35]:
df["age"].describe().T
Out[35]:
count   23486.00
mean       43.20
std        12.28
min        18.00
25%        34.00
50%        41.00
75%        52.00
max        99.00
Name: age, dtype: float64
In [36]:
plt.figure(figsize = (20, 8))
plt.title('Customer Age Distribution', fontsize=30)
plt.xlabel("Age", fontsize=24)
plt.ylabel("The Number of Customer Age", fontsize=18)

sns.histplot(df, x='age', kde = True, bins = 50);
In [37]:
fig_dims = (30, 10)
fig, ax = plt.subplots(figsize=fig_dims)
sns.barplot(data = df, x = "age", y = "recommended_ind", ax=ax)

plt.title('Customer Age Distribution By Recommendations', fontsize=30)
plt.xlabel("Age", fontsize=24)
plt.ylabel("Customer Recommendation Ratio", fontsize=18)
plt.xticks(rotation = 45);
In [38]:
fig = px.histogram(df['age'], color=df['recommended_ind'],
                   labels={'value': 'Age', 'color': 'Recommended'}, 
                   color_discrete_map={0: "magenta", 1: "MediumPurple"},
                   marginal='box')
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(title_text='Distribution of the Age and Recommendation',
                  title_x=0.5, title_font=dict(size=20))
fig.update_layout(barmode='overlay')
fig.show()
In [39]:
df["positive_feedback_count"].value_counts()
Out[39]:
positive_feedback_count
0      11176
1       4043
2       2193
3       1433
4        922
5        673
6        525
7        374
8        319
9        261
10       225
11       178
12       146
14       121
13       102
15        94
17        81
16        74
18        62
19        54
20        40
23        31
21        30
22        29
25        25
28        24
26        23
24        21
27        20
30        18
31        17
29        15
32        10
42         9
37         9
38         8
34         7
35         6
36         6
33         6
51         5
81         4
45         4
39         4
65         4
43         4
40         3
47         3
57         3
41         3
44         3
53         2
49         2
46         2
58         2
55         2
64         1
89         1
68         1
56         1
48         1
98         1
93         1
95         1
50         1
99         1
87         1
69         1
66         1
54         1
108        1
122        1
52         1
78         1
82         1
61         1
77         1
94         1
117        1
71         1
84         1
59         1
Name: count, dtype: int64
In [40]:
first_look("positive_feedback_count")
column name    :  positive_feedback_count
--------------------------------
per_of_nulls   :  % 0.0
num_of_nulls   :  0
num_of_uniques :  82
positive_feedback_count
0      11176
1       4043
2       2193
3       1433
4        922
5        673
6        525
7        374
8        319
9        261
10       225
11       178
12       146
14       121
13       102
15        94
17        81
16        74
18        62
19        54
20        40
23        31
21        30
22        29
25        25
28        24
26        23
24        21
27        20
30        18
31        17
29        15
32        10
42         9
37         9
38         8
34         7
35         6
36         6
33         6
51         5
81         4
45         4
39         4
65         4
43         4
40         3
47         3
57         3
41         3
44         3
53         2
49         2
46         2
58         2
55         2
64         1
89         1
68         1
56         1
48         1
98         1
93         1
95         1
50         1
99         1
87         1
69         1
66         1
54         1
108        1
122        1
52         1
78         1
82         1
61         1
77         1
94         1
117        1
71         1
84         1
59         1
Name: count, dtype: int64
In [41]:
df["positive_feedback_count"].describe().T
Out[41]:
count   23486.00
mean        2.54
std         5.70
min         0.00
25%         0.00
50%         1.00
75%         3.00
max       122.00
Name: positive_feedback_count, dtype: float64
In [42]:
plt.figure(figsize = (20, 8))
plt.title('Customer Positive Feedback Distribution', fontsize=20)
plt.xlabel("Customer Positive Feedback", fontsize=24)
plt.ylabel("The Number of Customer Positive Feedback", fontsize=18)

sns.histplot(df, x='positive_feedback_count', kde = True, bins = 50);
In [43]:
fig_dims = (30, 10)
fig, ax = plt.subplots(figsize=fig_dims)
sns.barplot(data = df, x = "positive_feedback_count", y = "age", ax=ax)

plt.xlabel("Customer Positive Feedback", fontsize=24)
plt.ylabel("Age", fontsize=24)

plt.title('Customer Positive Feedback Distribution By Age', fontsize=30)
plt.xticks(rotation = 45);
In [44]:
df.columns
Out[44]:
Index(['clothing_id', 'age', 'title', 'review_text', 'rating',
       'recommended_ind', 'positive_feedback_count', 'division_name',
       'department_name', 'class_name'],
      dtype='object')
In [45]:
df["division_name"].value_counts()
Out[45]:
division_name
General           13850
General Petite     8120
Initmates          1502
Name: count, dtype: int64
In [46]:
first_look("division_name")
column name    :  division_name
--------------------------------
per_of_nulls   :  % 0.06
num_of_nulls   :  14
num_of_uniques :  3
division_name
General           13850
General Petite     8120
Initmates          1502
NaN                  14
Name: count, dtype: int64
In [47]:
df["division_name"].describe().T
Out[47]:
count       23472
unique          3
top       General
freq        13850
Name: division_name, dtype: object
In [48]:
g = sns.catplot( x='division_name',
             kind="count", 
             data=df,
             height=5,
             aspect=2)

plt.title('Division Distribution', fontsize=24)
plt.xlabel("Division Name", fontsize=24)
plt.ylabel("The Number of Divisions", fontsize=20)

ax = g.facet_axis(0, 0)
for p in ax.patches:
    ax.text(p.get_x() + 0.28, 
            p.get_height() * 1.025, 
            '{0:.0f}'.format(p.get_height()), 
            color='black', rotation='horizontal', size='large')

plt.show()
In [49]:
plt.figure(figsize=(8, 8))

explode = [0.1, 0.1, 0]
plt.pie(df['division_name'].value_counts(), explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.legend(labels=['1', '2', '3'])
plt.title('Division Distribution', fontsize=20)
plt.axis('off');
In [50]:
g = sns.catplot(data = df, x ="division_name", hue = "recommended_ind", kind='count', height=5, aspect=2, legend_out=False)

plt.title('Division Distribution By Recommendation', fontsize=24)
plt.xlabel("Division Name By Recommendation", fontsize=20)
plt.ylabel("The Number of Divisions", fontsize=20)
plt.legend(title='Recommendation Indicator', loc='upper left', labels=['Not Recomnended', 'Recomnended'])

ax = g.facet_axis(0, 0)
for p in ax.patches:
    ax.text(p.get_x() + 0.12, 
            p.get_height() * 1.025, 
            '{0:.0f}'.format(p.get_height()), 
            color='black', rotation='horizontal', size='large')

plt.show()
In [51]:
g = sns.catplot(data = df, x ="rating", hue = "division_name", kind='count', height=5, aspect=2, legend_out=False)

plt.title('Rating Distribution By Division', fontsize=24)
plt.xlabel("Ratings By Division", fontsize=20)
plt.ylabel("The Number of Ratings", fontsize=20)
plt.legend(title='Division Name', loc='upper left', labels=['Intimates', 'General', 'General Petite'])

ax = g.facet_axis(0, 0)
for p in ax.patches:
    ax.text(p.get_x() + 0.04, 
            p.get_height() * 1.025, 
            '{0:.0f}'.format(p.get_height()), 
            color='black', rotation='horizontal', size='large')

plt.show()
In [52]:
df["department_name"].value_counts()
Out[52]:
department_name
Tops        10468
Dresses      6319
Bottoms      3799
Intimate     1735
Jackets      1032
Trend         119
Name: count, dtype: int64
In [53]:
first_look("department_name")
column name    :  department_name
--------------------------------
per_of_nulls   :  % 0.06
num_of_nulls   :  14
num_of_uniques :  6
department_name
Tops        10468
Dresses      6319
Bottoms      3799
Intimate     1735
Jackets      1032
Trend         119
NaN            14
Name: count, dtype: int64
In [54]:
df["department_name"].describe().T
Out[54]:
count     23472
unique        6
top        Tops
freq      10468
Name: department_name, dtype: object
In [55]:
g = sns.catplot(data = df, x ="department_name", kind='count', height=5, aspect=2)

plt.title('Department Distribution', fontsize=26)
plt.xlabel("Department Name", fontsize=20)
plt.ylabel("The Number of Departments", fontsize=20)

ax = g.facet_axis(0, 0)
for p in ax.patches:
    ax.text(p.get_x() + 0.28, 
            p.get_height() * 1.025, 
            '{0:.0f}'.format(p.get_height()), 
            color='black', rotation='horizontal', size='large')

plt.show()
In [56]:
plt.figure(figsize=(8, 8))

explode = [0.1, 0, 0, 0, 0, 0]
plt.pie(df['department_name'].value_counts(), explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.legend(labels=['Tops', 'Dresses', 'Bottoms', 'Intimate', 'Jackets', 'Trend'])
plt.title('Department Distribution', fontsize=20)
plt.axis('off');
In [57]:
g = sns.catplot(data = df, x ="department_name", hue = "recommended_ind", kind='count', height=7, aspect=2.5, legend_out=False)

plt.title('Department Distribution By Recommendation', fontsize=26)
plt.xlabel("Department Name", fontsize=20)
plt.ylabel("The Number of Recommendations", fontsize=20)
plt.legend(title='Recommendation Indicator', loc='upper left', labels=['Not Recomnended', 'Recomnended'], fontsize='x-large', title_fontsize='24')

ax = g.facet_axis(0, 0)
for p in ax.patches:
    ax.text(p.get_x() + 0.12, 
            p.get_height() * 1.025, 
            '{0:.0f}'.format(p.get_height()), 
            color='black', rotation='horizontal', size='large')

plt.show()
In [58]:
g = sns.catplot(data = df, x ="rating", hue = "department_name", kind='count', height=10, aspect=2.5, legend_out=False)

plt.title('Department Distribution By Recommendation', fontsize=26)
plt.xlabel("Department Name", fontsize=20)
plt.ylabel("The Number of Recommendations", fontsize=20)
plt.legend(title='Department Name', loc='upper left', labels=['Intimates', 'Dresses', 'Bottoms', 'Tops', 'Jackets', 'Trend'], fontsize='x-large', title_fontsize='24')
plt.figure(figsize=(15, 8))

ax = g.facet_axis(0, 0)
for p in ax.patches:
    ax.text(p.get_x() + 0.025, 
            p.get_height() * 1.025, 
            '{0:.0f}'.format(p.get_height()), 
            color='black', rotation='horizontal', size='large')

plt.show()
<Figure size 1500x800 with 0 Axes>
In [59]:
df["class_name"].value_counts()
Out[59]:
class_name
Dresses           6319
Knits             4843
Blouses           3097
Sweaters          1428
Pants             1388
Jeans             1147
Fine gauge        1100
Skirts             945
Jackets            704
Lounge             691
Swim               350
Outerwear          328
Shorts             317
Sleep              228
Legwear            165
Intimates          154
Layering           146
Trend              119
Casual bottoms       2
Chemises             1
Name: count, dtype: int64
In [60]:
first_look("class_name")
column name    :  class_name
--------------------------------
per_of_nulls   :  % 0.06
num_of_nulls   :  14
num_of_uniques :  20
class_name
Dresses           6319
Knits             4843
Blouses           3097
Sweaters          1428
Pants             1388
Jeans             1147
Fine gauge        1100
Skirts             945
Jackets            704
Lounge             691
Swim               350
Outerwear          328
Shorts             317
Sleep              228
Legwear            165
Intimates          154
Layering           146
Trend              119
NaN                 14
Casual bottoms       2
Chemises             1
Name: count, dtype: int64
In [61]:
df["class_name"].describe().T
Out[61]:
count       23472
unique         20
top       Dresses
freq         6319
Name: class_name, dtype: object
In [62]:
plt.title('Product Class Distribution', fontsize=25)
df["class_name"].value_counts().plot(kind="pie", autopct='%1.1f%%', figsize=(16, 16));
In [63]:
g = sns.catplot(data = df, x ="department_name", hue = "rating", kind='count', height=10, aspect=2.5)

sns.set(rc = {'figure.figsize':(30, 12)})
plt.title('Department Distribution By Rating', fontsize=30)
plt.xlabel("Department Name", fontsize=24)
plt.ylabel("The Number of Ratings", fontsize=24)
plt.legend(title='Rating Label', loc='upper left', labels=['1', '2', '3', '4', '5'], fontsize='x-large', title_fontsize='24')

ax = g.facet_axis(0, 0)
for p in ax.patches:
    ax.text(p.get_x() + 0.02, 
            p.get_height() * 1.025, 
            '{0:.0f}'.format(p.get_height()), 
            color='black', rotation='horizontal', size='large', fontsize = 18)

plt.show()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[63], line 11
      9 ax = g.facet_axis(0, 0)
     10 for p in ax.patches:
---> 11     ax.text(p.get_x() + 0.02, 
     12             p.get_height() * 1.025, 
     13             '{0:.0f}'.format(p.get_height()), 
     14             color='black', rotation='horizontal', size='large', fontsize = 18)
     16 plt.show()

File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\axes\_axes.py:689, in Axes.text(self, x, y, s, fontdict, **kwargs)
    628 """
    629 Add text to the Axes.
    630 
   (...)
    679     >>> text(x, y, s, bbox=dict(facecolor='red', alpha=0.5))
    680 """
    681 effective_kwargs = {
    682     'verticalalignment': 'baseline',
    683     'horizontalalignment': 'left',
   (...)
    687     **kwargs,
    688 }
--> 689 t = mtext.Text(x, y, text=s, **effective_kwargs)
    690 t.set_clip_path(self.patch)
    691 self._add_text(t)

File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\_api\deprecation.py:454, in make_keyword_only.<locals>.wrapper(*args, **kwargs)
    448 if len(args) > name_idx:
    449     warn_deprecated(
    450         since, message="Passing the %(name)s %(obj_type)s "
    451         "positionally is deprecated since Matplotlib %(since)s; the "
    452         "parameter will become keyword-only %(removal)s.",
    453         name=name, obj_type=f"parameter of {func.__name__}()")
--> 454 return func(*args, **kwargs)

File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\text.py:183, in Text.__init__(self, x, y, text, color, verticalalignment, horizontalalignment, multialignment, fontproperties, rotation, linespacing, rotation_mode, usetex, wrap, transform_rotates_text, parse_math, **kwargs)
    167 self._text = ''
    168 self._reset_visual_defaults(
    169     text=text,
    170     color=color,
   (...)
    181     rotation_mode=rotation_mode,
    182 )
--> 183 self.update(kwargs)

File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\text.py:223, in Text.update(self, kwargs)
    221 def update(self, kwargs):
    222     # docstring inherited
--> 223     kwargs = cbook.normalize_kwargs(kwargs, Text)
    224     sentinel = object()  # bbox can be None, so use another sentinel.
    225     # Update fontproperties first, as it has lowest priority.

File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\cbook\__init__.py:1779, in normalize_kwargs(kw, alias_mapping)
   1777 canonical = to_canonical.get(k, k)
   1778 if canonical in canonical_to_seen:
-> 1779     raise TypeError(f"Got both {canonical_to_seen[canonical]!r} and "
   1780                     f"{k!r}, which are aliases of one another")
   1781 canonical_to_seen[canonical] = k
   1782 ret[canonical] = v

TypeError: Got both 'size' and 'fontsize', which are aliases of one another
In [67]:
df_cat = df[['division_name', 'department_name', 'class_name', "recommended_ind"]]
df_cat["recommended_ind"] = df_cat["recommended_ind"].apply(lambda x: "Recommended" if x>=1 else "Not Recommended")
df_cat.rename({'division_name': 'Division Name', 'department_name': 'Department Name', 'class_name': 'Class Name', 'recommended_ind': 'Recommendation Indicator'}, axis=1, inplace=True)
df_cat
Out[67]:
Division Name Department Name Class Name Recommendation Indicator
0 Initmates Intimate Intimates Recommended
1 General Dresses Dresses Recommended
2 General Dresses Dresses Not Recommended
3 General Petite Bottoms Pants Recommended
4 General Tops Blouses Recommended
... ... ... ... ...
23481 General Petite Dresses Dresses Recommended
23482 General Petite Tops Knits Recommended
23483 General Petite Dresses Dresses Not Recommended
23484 General Dresses Dresses Recommended
23485 General Petite Dresses Dresses Recommended

23486 rows × 4 columns

In [68]:
df_num = df[['age', 'rating', 'positive_feedback_count', 'recommended_ind']]
df_num["recommended_ind"] = df_num["recommended_ind"].apply(lambda x: "Recommended" if x>=1 else "Not Recommended")
df_num.rename({'age': 'Age', 'rating': 'Rating', 'positive_feedback_count': 'Positive Feedback', 'recommended_ind': 'Recommendation Indicator'}, axis=1, inplace=True)
df_num
Out[68]:
Age Rating Positive Feedback Recommendation Indicator
0 33 4 0 Recommended
1 34 5 4 Recommended
2 60 3 0 Not Recommended
3 50 5 0 Recommended
4 47 5 6 Recommended
... ... ... ... ...
23481 34 5 0 Recommended
23482 48 3 0 Recommended
23483 31 3 1 Not Recommended
23484 28 3 2 Recommended
23485 52 5 22 Recommended

23486 rows × 4 columns

In [69]:
for i, col in enumerate(df_cat.columns):
    xtab = pd.crosstab(df_cat[col], df_cat["Recommendation Indicator"], normalize=True)
    print(colored('-'*55, 'red', attrs=['bold']), sep='')
    print(xtab*100)
-------------------------------------------------------
Recommendation Indicator  Not Recommended  Recommended
Division Name                                         
General                             10.81        48.20
General Petite                       6.02        28.57
Initmates                            0.95         5.45
-------------------------------------------------------
Recommendation Indicator  Not Recommended  Recommended
Department Name                                       
Bottoms                              2.41        13.78
Dresses                              5.16        21.76
Intimate                             1.11         6.28
Jackets                              0.72         3.68
Tops                                 8.24        36.35
Trend                                0.13         0.37
-------------------------------------------------------
Recommendation Indicator  Not Recommended  Recommended
Class Name                                            
Blouses                              2.51        10.69
Casual bottoms                       0.00         0.01
Chemises                             0.00         0.00
Dresses                              5.16        21.76
Fine gauge                           0.76         3.92
Intimates                            0.09         0.56
Jackets                              0.46         2.53
Jeans                                0.58         4.31
Knits                                3.76        16.87
Layering                             0.07         0.55
Legwear                              0.10         0.60
Lounge                               0.41         2.53
Outerwear                            0.26         1.14
Pants                                0.99         4.93
Shorts                               0.22         1.13
Skirts                               0.62         3.40
Sleep                                0.14         0.83
Sweaters                             1.21         4.87
Swim                                 0.29         1.20
Trend                                0.13         0.37
-------------------------------------------------------
Recommendation Indicator  Not Recommended  Recommended
Recommendation Indicator                              
Not Recommended                     17.76         0.00
Recommended                          0.00        82.24
In [70]:
for i, col in enumerate(df_num.columns):
    xtab = pd.crosstab(df_num[col], df_num["Recommendation Indicator"], normalize=True)
    print(colored('-'*55, 'red', attrs=['bold']), sep='')
    print(xtab*100)
-------------------------------------------------------
Recommendation Indicator  Not Recommended  Recommended
Age                                                   
18                                   0.00         0.02
19                                   0.02         0.15
20                                   0.06         0.40
21                                   0.04         0.40
22                                   0.06         0.56
23                                   0.16         0.89
24                                   0.17         0.87
25                                   0.27         1.14
26                                   0.33         1.47
27                                   0.25         1.21
28                                   0.44         1.38
29                                   0.48         1.71
30                                   0.36         1.37
31                                   0.52         1.90
32                                   0.45         2.24
33                                   0.69         2.40
34                                   0.61         2.81
35                                   0.79         3.08
36                                   0.63         2.96
37                                   0.68         2.58
38                                   0.70         2.62
39                                   0.69         4.72
40                                   0.52         2.11
41                                   0.51         2.64
42                                   0.56         2.21
43                                   0.51         1.96
44                                   0.47         2.15
45                                   0.39         1.86
46                                   0.69         2.35
47                                   0.44         1.96
48                                   0.49         2.18
49                                   0.33         1.76
50                                   0.29         1.41
51                                   0.31         1.36
52                                   0.29         1.60
53                                   0.45         1.94
54                                   0.24         1.44
55                                   0.22         1.18
56                                   0.33         1.68
57                                   0.27         1.28
58                                   0.15         1.03
59                                   0.19         1.18
60                                   0.22         1.23
61                                   0.13         0.83
62                                   0.19         1.25
63                                   0.17         0.85
64                                   0.18         0.87
65                                   0.10         0.86
66                                   0.13         1.04
67                                   0.14         0.53
68                                   0.09         0.52
69                                   0.07         0.41
70                                   0.06         0.33
71                                   0.04         0.18
72                                   0.03         0.17
73                                   0.03         0.14
74                                   0.03         0.18
75                                   0.04         0.07
76                                   0.01         0.03
77                                   0.02         0.06
78                                   0.01         0.05
79                                   0.00         0.06
80                                   0.00         0.04
81                                   0.01         0.01
82                                   0.01         0.05
83                                   0.00         0.18
84                                   0.00         0.02
85                                   0.00         0.02
86                                   0.00         0.01
87                                   0.00         0.02
89                                   0.00         0.02
90                                   0.00         0.00
91                                   0.01         0.01
92                                   0.00         0.00
93                                   0.00         0.01
94                                   0.00         0.01
99                                   0.00         0.01
-------------------------------------------------------
Recommendation Indicator  Not Recommended  Recommended
Rating                                                
1                                    3.52         0.07
2                                    6.26         0.40
3                                    7.16         5.06
4                                    0.72        20.90
5                                    0.11        55.80
-------------------------------------------------------
Recommendation Indicator  Not Recommended  Recommended
Positive Feedback                                     
0                                    7.25        40.33
1                                    2.93        14.29
2                                    1.69         7.64
3                                    1.16         4.94
4                                    0.87         3.05
5                                    0.72         2.14
6                                    0.50         1.73
7                                    0.36         1.23
8                                    0.25         1.11
9                                    0.25         0.86
10                                   0.24         0.72
11                                   0.15         0.60
12                                   0.13         0.49
13                                   0.14         0.29
14                                   0.16         0.36
15                                   0.13         0.27
16                                   0.07         0.25
17                                   0.12         0.23
18                                   0.07         0.20
19                                   0.03         0.20
20                                   0.06         0.11
21                                   0.04         0.09
22                                   0.03         0.10
23                                   0.03         0.10
24                                   0.02         0.07
25                                   0.03         0.07
26                                   0.04         0.06
27                                   0.03         0.06
28                                   0.02         0.09
29                                   0.01         0.06
30                                   0.02         0.06
31                                   0.01         0.06
32                                   0.01         0.03
33                                   0.00         0.03
34                                   0.01         0.02
35                                   0.01         0.01
36                                   0.01         0.02
37                                   0.02         0.02
38                                   0.02         0.02
39                                   0.01         0.01
40                                   0.00         0.01
41                                   0.00         0.01
42                                   0.03         0.01
43                                   0.01         0.00
44                                   0.01         0.00
45                                   0.00         0.02
46                                   0.00         0.01
47                                   0.00         0.01
48                                   0.00         0.00
49                                   0.01         0.00
50                                   0.00         0.00
51                                   0.01         0.01
52                                   0.00         0.00
53                                   0.00         0.01
54                                   0.00         0.00
55                                   0.01         0.00
56                                   0.00         0.00
57                                   0.00         0.01
58                                   0.00         0.01
59                                   0.00         0.00
61                                   0.00         0.00
64                                   0.00         0.00
65                                   0.00         0.01
66                                   0.00         0.00
68                                   0.00         0.00
69                                   0.00         0.00
71                                   0.00         0.00
77                                   0.00         0.00
78                                   0.00         0.00
81                                   0.00         0.02
82                                   0.00         0.00
84                                   0.00         0.00
87                                   0.00         0.00
89                                   0.00         0.00
93                                   0.00         0.00
94                                   0.00         0.00
95                                   0.00         0.00
98                                   0.00         0.00
99                                   0.00         0.00
108                                  0.00         0.00
117                                  0.00         0.00
122                                  0.00         0.00
-------------------------------------------------------
Recommendation Indicator  Not Recommended  Recommended
Recommendation Indicator                              
Not Recommended                     17.76         0.00
Recommended                          0.00        82.24
In [71]:
df.columns
Out[71]:
Index(['clothing_id', 'age', 'title', 'review_text', 'rating',
       'recommended_ind', 'positive_feedback_count', 'division_name',
       'department_name', 'class_name'],
      dtype='object')
In [72]:
df.drop(['clothing_id', 'age', 'title', 'rating',
       'positive_feedback_count', 'division_name',
       'department_name', 'class_name'], axis=1, inplace=True)
In [73]:
df.head(3)
Out[73]:
review_text recommended_ind
0 Absolutely wonderful - silky and sexy and comfortable 1
1 Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8". i love the length... 1
2 I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i co... 0
In [74]:
df['review_text'].isnull().value_counts()
Out[74]:
review_text
False    22641
True       845
Name: count, dtype: int64
In [75]:
df['recommended_ind'].isnull().value_counts()
Out[75]:
recommended_ind
False    23486
Name: count, dtype: int64
In [76]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review_text      22641 non-null  object
 1   recommended_ind  23486 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 367.1+ KB
In [77]:
df = df.dropna()
In [78]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 22641 entries, 0 to 23485
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review_text      22641 non-null  object
 1   recommended_ind  22641 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 530.6+ KB
In [79]:
df['review_text'].isnull().value_counts()
Out[79]:
review_text
False    22641
Name: count, dtype: int64
In [80]:
df['recommended_ind'].isnull().value_counts()
Out[80]:
recommended_ind
False    22641
Name: count, dtype: int64
In [81]:
missing_values(df)
Out[81]:
Missing_Number Missing_Percent
In [82]:
blanks = []  # start with an empty list

for rv in df.itertuples(): # iterate over the DataFrame
    if type(rv)==str and rv.isspace(): # avoid NaN values and test 'review' for whitespace
        blanks.append(i)
blanks
Out[82]:
[]
In [83]:
df["review_text"].str.isspace().sum()
Out[83]:
0
In [84]:
df[df["review_text"].str.isspace() == True].index
Out[84]:
Index([], dtype='int64')
In [85]:
def cleaning_fsa(data):
    
    import re
    #1. Remove Puncs
    # \w typically matches [A-Za-z0-9_]
    text = re.sub('[^\w\s]','', data)
         
    #2. Tokenize
    text_tokens = word_tokenize(text.lower()) 
    
    #3. Remove numbers
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]
    
    #4. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
    
    #5. lemma
    text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
    
    #joining
    return " ".join(text_cleaned)
In [86]:
stop_words = stopwords.words('english')
In [87]:
text = cleaning_fsa(str(df["review_text"]))
text
Out[87]:
'absolutely wonderful silky sexy comfortable love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length high hope dress really wanted work initially ordered petite small usual size found outrageously small small fact co love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt happy snag dress great price easy slip flattering cut color combo reminds maternity clothes soft stretchy shiny material cut flattering drape nicely found one button close front looked awkward nice long sleevesnnot bu fit well top see never would worked im glad able try store didnt order online different fabric would bought dress wedding summer cute unfortunately fit isnt perfect medium fit waist perfectly way long big bust sh dress lovely platinum feminine fit perfectly easy wear comfy highly recommend name length dtype object'
In [88]:
df["review_text"] = df["review_text"].apply(cleaning_fsa)
df["review_text"].head()
Out[88]:
0                                                                                                                                                                absolutely wonderful silky sexy comfortable
1                       love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite
2    high hope dress really wanted work initially ordered petite small usual size found outrageously small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom...
3                                                                                                                   love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment
4                                                                                             shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt
Name: review_text, dtype: object
In [89]:
df.head(3)
Out[89]:
review_text recommended_ind
0 absolutely wonderful silky sexy comfortable 1
1 love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite 1
2 high hope dress really wanted work initially ordered petite small usual size found outrageously small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom... 0
In [90]:
" ".join(df["review_text"]).split()
Out[90]:
['absolutely',
 'wonderful',
 'silky',
 'sexy',
 'comfortable',
 'love',
 'dress',
 'sooo',
 'pretty',
 'happened',
 'find',
 'store',
 'im',
 'glad',
 'bc',
 'never',
 'would',
 'ordered',
 'online',
 'bc',
 'petite',
 'bought',
 'petite',
 'love',
 'length',
 'hit',
 'little',
 'knee',
 'would',
 'definitely',
 'true',
 'midi',
 'someone',
 'truly',
 'petite',
 'high',
 'hope',
 'dress',
 'really',
 'wanted',
 'work',
 'initially',
 'ordered',
 'petite',
 'small',
 'usual',
 'size',
 'found',
 'outrageously',
 'small',
 'small',
 'fact',
 'could',
 'zip',
 'reordered',
 'petite',
 'medium',
 'ok',
 'overall',
 'top',
 'half',
 'comfortable',
 'fit',
 'nicely',
 'bottom',
 'half',
 'tight',
 'layer',
 'several',
 'somewhat',
 'cheap',
 'net',
 'layer',
 'imo',
 'major',
 'design',
 'flaw',
 'net',
 'layer',
 'sewn',
 'directly',
 'zipper',
 'c',
 'love',
 'love',
 'love',
 'jumpsuit',
 'fun',
 'flirty',
 'fabulous',
 'every',
 'time',
 'wear',
 'get',
 'nothing',
 'great',
 'compliment',
 'shirt',
 'flattering',
 'due',
 'adjustable',
 'front',
 'tie',
 'perfect',
 'length',
 'wear',
 'legging',
 'sleeveless',
 'pair',
 'well',
 'cardigan',
 'love',
 'shirt',
 'love',
 'tracy',
 'reese',
 'dress',
 'one',
 'petite',
 'foot',
 'tall',
 'usually',
 'wear',
 'brand',
 'dress',
 'pretty',
 'package',
 'lot',
 'dress',
 'skirt',
 'long',
 'full',
 'overwhelmed',
 'small',
 'frame',
 'stranger',
 'alteration',
 'shortening',
 'narrowing',
 'skirt',
 'would',
 'take',
 'away',
 'embellishment',
 'garment',
 'love',
 'color',
 'idea',
 'style',
 'work',
 'returned',
 'dress',
 'aded',
 'basket',
 'hte',
 'last',
 'mintue',
 'see',
 'would',
 'look',
 'like',
 'person',
 'store',
 'pick',
 'went',
 'teh',
 'darkler',
 'color',
 'pale',
 'hte',
 'color',
 'really',
 'gorgeous',
 'turn',
 'mathced',
 'everythiing',
 'trying',
 'prefectly',
 'little',
 'baggy',
 'hte',
 'x',
 'hte',
 'msallet',
 'size',
 'bummer',
 'petite',
 'decided',
 'jkeep',
 'though',
 'said',
 'matvehd',
 'everything',
 'ejans',
 'pant',
 'skirt',
 'waas',
 'trying',
 'kept',
 'oops',
 'ordered',
 'carbon',
 'store',
 'pick',
 'ton',
 'stuff',
 'always',
 'try',
 'used',
 'top',
 'pair',
 'skirt',
 'pant',
 'everything',
 'went',
 'color',
 'really',
 'nice',
 'charcoal',
 'shimmer',
 'went',
 'well',
 'pencil',
 'skirt',
 'flare',
 'pant',
 'etc',
 'compaint',
 'bit',
 'big',
 'sleeve',
 'long',
 'doesnt',
 'go',
 'petite',
 'also',
 'bit',
 'loose',
 'xx',
 'kept',
 'wil',
 'ldecide',
 'later',
 'since',
 'light',
 'color',
 'already',
 'sold',
 'hte',
 'smallest',
 'size',
 'love',
 'dress',
 'usually',
 'get',
 'x',
 'run',
 'little',
 'snug',
 'bust',
 'ordered',
 'size',
 'flattering',
 'feminine',
 'usual',
 'retailer',
 'flair',
 'style',
 'im',
 'lb',
 'ordered',
 'petite',
 'make',
 'sure',
 'length',
 'wasnt',
 'long',
 'typically',
 'wear',
 'x',
 'regular',
 'retailer',
 'dress',
 'youre',
 'le',
 'busty',
 'cup',
 'smaller',
 'petite',
 'fit',
 'perfectly',
 'snug',
 'tight',
 'love',
 'could',
 'dress',
 'party',
 'work',
 'love',
 'tulle',
 'longer',
 'fabric',
 'underneath',
 'dress',
 'run',
 'small',
 'esp',
 'zipper',
 'area',
 'run',
 'ordered',
 'sp',
 'typically',
 'fit',
 'tight',
 'material',
 'top',
 'look',
 'feel',
 'cheap',
 'even',
 'pulling',
 'cause',
 'rip',
 'fabric',
 'pretty',
 'disappointed',
 'going',
 'christmas',
 'dress',
 'year',
 'needle',
 'say',
 'going',
 'back',
 'dress',
 'perfection',
 'pretty',
 'flattering',
 'find',
 'reliant',
 'review',
 'written',
 'savvy',
 'shopper',
 'past',
 'right',
 'estimation',
 'product',
 'case',
 'dressif',
 'reveiwsi',
 'doubt',
 'would',
 'even',
 'tried',
 'dress',
 'beautifully',
 'made',
 'lined',
 'reminiscent',
 'old',
 'retailer',
 'quality',
 'lined',
 'solid',
 'periwinklecolored',
 'fabric',
 'match',
 'outer',
 'fabric',
 'print',
 'tt',
 'formfitting',
 'fall',
 'knee',
 'rid',
 'bought',
 'black',
 'x',
 'go',
 'larkspur',
 'midi',
 'dress',
 'didnt',
 'bother',
 'lining',
 'skirt',
 'portion',
 'grrrrrrrrrrr',
 'stats',
 'x',
 'fit',
 'smoothly',
 'around',
 'chest',
 'flowy',
 'around',
 'lower',
 'half',
 'would',
 'say',
 'running',
 'big',
 'strap',
 'pretty',
 'could',
 'easily',
 'nightwear',
 'im',
 'came',
 'knee',
 'nice',
 'choice',
 'holiday',
 'gathering',
 'like',
 'length',
 'graz',
 'knee',
 'conservative',
 'enough',
 'office',
 'related',
 'gathering',
 'size',
 'small',
 'fit',
 'well',
 'usually',
 'size',
 'small',
 'bust',
 'opinion',
 'run',
 'small',
 'larger',
 'bust',
 'definitely',
 'size',
 'perhaps',
 'waist',
 'big',
 'problem',
 'dress',
 'quality',
 'fabric',
 'terrible',
 'delicate',
 'netting',
 'type',
 'fabric',
 'top',
 'layer',
 'skirt',
 'got',
 'stuck',
 'zip',
 'took',
 'package',
 'wanted',
 'fit',
 'badly',
 'could',
 'tell',
 'put',
 'wouldnt',
 'hourglass',
 'figure',
 'straight',
 'waist',
 'way',
 'small',
 'body',
 'shape',
 'even',
 'sized',
 'could',
 'tell',
 'would',
 'still',
 'tight',
 'waist',
 'roomy',
 'hip',
 'said',
 'really',
 'nice',
 'sturdy',
 'linenlike',
 'fabric',
 'pretty',
 'color',
 'well',
 'made',
 'hope',
 'make',
 'someone',
 'happy',
 'material',
 'color',
 'nice',
 'leg',
 'opening',
 'large',
 'length',
 'hit',
 'right',
 'ankle',
 'leg',
 'opening',
 'size',
 'waist',
 'hem',
 'line',
 'ankle',
 'front',
 'pleat',
 'make',
 'fluffy',
 'think',
 'imagine',
 'flattering',
 'look',
 'least',
 'average',
 'height',
 'taller',
 'may',
 'look',
 'good',
 'took',
 'chance',
 'blouse',
 'glad',
 'wasnt',
 'crazy',
 'blouse',
 'photographed',
 'model',
 'paired',
 'whit',
 'white',
 'pant',
 'worked',
 'perfectly',
 'crisp',
 'clean',
 'would',
 'describe',
 'launders',
 'well',
 'fit',
 'great',
 'drape',
 'perfect',
 'wear',
 'tucked',
 'cant',
 'go',
 'wrong',
 'flattering',
 'super',
 'cozy',
 'coat',
 'work',
 'well',
 'cold',
 'dry',
 'day',
 'look',
 'good',
 'jean',
 'dressier',
 'outfit',
 'small',
 'fit',
 'great',
 'love',
 'look',
 'feel',
 'tulle',
 'dress',
 'looking',
 'something',
 'different',
 'top',
 'new',
 'year',
 'eve',
 'im',
 'small',
 'chested',
 'top',
 'dress',
 'form',
 'fitting',
 'flattering',
 'look',
 'steamed',
 'tulle',
 'perfect',
 'ordered',
 'xsp',
 'length',
 'perfect',
 'product',
 'petite',
 'would',
 'get',
 'petite',
 'regular',
 'little',
 'long',
 'tailor',
 'simple',
 'fix',
 'fit',
 'nicely',
 'im',
 'pregnant',
 'bough',
 'medium',
 'grow',
 'tie',
 'front',
 'back',
 'provides',
 'nice',
 'flexibility',
 'form',
 'fitting',
 'im',
 'upset',
 'price',
 'dress',
 'thought',
 'embroidered',
 'print',
 'fabric',
 'think',
 'cried',
 'little',
 'opened',
 'box',
 'still',
 'ver',
 'pretty',
 'would',
 'say',
 'true',
 'size',
 'tad',
 'bit',
 'big',
 'tiny',
 'still',
 'get',
 'away',
 'color',
 'vibrant',
 'style',
 'unique',
 'skirt',
 'portion',
 'pretty',
 'poofy',
 'keep',
 'going',
 'back',
 'forth',
 'mainly',
 'price',
 'although',
 'quality',
 'definitely',
 'except',
 'wish',
 'emb',
 'first',
 'pullover',
 'styling',
 'side',
 'zipper',
 'wouldnt',
 'purchased',
 'knew',
 'side',
 'zipper',
 'large',
 'bust',
 'side',
 'zipper',
 'next',
 'impossible',
 'second',
 'tulle',
 'feel',
 'look',
 'cheap',
 'slip',
 'awkward',
 'tight',
 'shape',
 'underneath',
 'look',
 'like',
 'described',
 'sadly',
 'returning',
 'im',
 'sure',
 'find',
 'something',
 'exchange',
 'cute',
 'little',
 'dress',
 'fit',
 'tt',
 'little',
 'high',
 'waisted',
 'good',
 'length',
 'height',
 'like',
 'dress',
 'im',
 'love',
 'dont',
 'think',
 'look',
 'feel',
 'cheap',
 'appears',
 'pictured',
 'love',
 'shirt',
 'first',
 'saw',
 'wasnt',
 'sure',
 'shirt',
 'dress',
 'since',
 'seethrough',
 'wear',
 'like',
 'dress',
 'need',
 'slip',
 'wear',
 'legging',
 'bought',
 'slip',
 'wore',
 'tie',
 'back',
 'rocked',
 'white',
 'wedge',
 'could',
 'also',
 'wear',
 'vest',
 'careful',
 'button',
 'havent',
 'fall',
 'yet',
 'feel',
 'like',
 'overall',
 'great',
 'occasion',
 'fun',
 'wear',
 'loved',
 'material',
 'didnt',
 'really',
 'look',
 'long',
 'dress',
 'purchased',
 'large',
 'medium',
 'im',
 'atleast',
 'material',
 'foot',
 'gap',
 'front',
 'much',
 'wider',
 'look',
 'felt',
 'like',
 'dress',
 'fell',
 'flat',
 'returned',
 'im',
 'usually',
 'large',
 'med',
 'fit',
 'better',
 'jean',
 'waiting',
 'sweater',
 'coat',
 'ship',
 'week',
 'excited',
 'arrive',
 'coat',
 'true',
 'size',
 'made',
 'look',
 'short',
 'squat',
 'sleeve',
 'wide',
 'although',
 'long',
 'light',
 'weight',
 'fall',
 'coat',
 'sleeve',
 'dont',
 'need',
 'wide',
 'wouldnt',
 'layerng',
 'much',
 'underneath',
 'button',
 'need',
 'moved',
 'least',
 'three',
 'inch',
 'nicer',
 'fit',
 'thought',
 'redoing',
 'button',
 'sleeve',
 'looked',
 'even',
 'proportion',
 'tigh',
 'color',
 'werent',
 'expected',
 'either',
 'dark',
 'blue',
 'much',
 'vibrant',
 'couldnt',
 'find',
 'anything',
 'really',
 'go',
 'fabric',
 'thick',
 'good',
 'quality',
 'nice',
 'weight',
 'movement',
 'skirt',
 'wasnt',
 'end',
 'several',
 'goodhyouman',
 'shirt',
 'get',
 'many',
 'compliment',
 'especially',
 'one',
 'say',
 'forehead',
 'kiss',
 'underrated',
 'dont',
 'hesitate',
 'buy',
 'shirt',
 'wont',
 'sorry',
 'sweater',
 'comfy',
 'classic',
 'balance',
 'quirky',
 'handknit',
 'look',
 'beautiful',
 'color',
 'practical',
 'fit',
 'bit',
 'cropped',
 'boxy',
 'part',
 'style',
 'others',
 'mentioned',
 'gap',
 'knit',
 'make',
 'seethrough',
 'opinion',
 'make',
 'perfect',
 'layering',
 'like',
 'longer',
 'camisole',
 'showing',
 'underneath',
 'wearing',
 'little',
 'dress',
 'warm',
 'still',
 'thin',
 'enough',
 'fit',
 'jacket',
 'coat',
 'beautifully',
 'made',
 'pant',
 'trend',
 'flared',
 'crop',
 'much',
 'cuter',
 'person',
 'love',
 'never',
 'would',
 'given',
 'pant',
 'second',
 'look',
 'online',
 'person',
 'much',
 'cuter',
 'stripe',
 'brighter',
 'fit',
 'flattering',
 'crop',
 'cute',
 'flare',
 'right',
 'trend',
 'brand',
 'always',
 'run',
 'small',
 'carry',
 'chubbiness',
 'belly',
 'paired',
 'collarless',
 'loose',
 'navy',
 'blazer',
 'pant',
 'even',
 'better',
 'person',
 'downside',
 'need',
 'dry',
 'cleaned',
 'ordered',
 'month',
 'ago',
 'finally',
 'came',
 'back',
 'order',
 'huge',
 'disappointment',
 'fit',
 'much',
 'issue',
 'quality',
 ...]
In [91]:
word_values = pd.Series(" ".join(df["review_text"]).split()).value_counts()
word_values
Out[91]:
dress       11319
fit         10091
size         9349
love         8968
top          8256
            ...  
takeout         1
teenybop        1
hugely          1
shirred         1
platinum        1
Name: count, Length: 16758, dtype: int64
In [92]:
rare_words = word_values[word_values <= 2]
rare_words
Out[92]:
wristforearm    2
magnolia        2
smalltight      2
urban           2
underrated      2
               ..
takeout         1
teenybop        1
hugely          1
shirred         1
platinum        1
Name: count, Length: 10935, dtype: int64
In [93]:
rare_words.value_counts()
Out[93]:
count
1    9058
2    1877
Name: count, dtype: int64
In [94]:
len(rare_words)
Out[94]:
10935
In [95]:
rare_words.index
Out[95]:
Index(['wristforearm', 'magnolia', 'smalltight', 'urban', 'underrated',
       'henleys', 'greenyellow', 'outrageous', 'blueteal', 'status',
       ...
       'quaiity', 'befo', 'slam', 'brokenin', 'accomadate', 'takeout',
       'teenybop', 'hugely', 'shirred', 'platinum'],
      dtype='object', length=10935)
In [96]:
df["review_text"] = df["review_text"].apply(lambda x: " ".join([i for i in x.split() if i not in rare_words.index]))
df["review_text"].head()
Out[96]:
0                                                                                                                                                                absolutely wonderful silky sexy comfortable
1                       love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite
2    high hope dress really wanted work initially ordered petite small usual size found small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom half tight l...
3                                                                                                                   love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment
4                                                                                             shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt
Name: review_text, dtype: object
In [97]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 22641 entries, 0 to 23485
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review_text      22641 non-null  object
 1   recommended_ind  22641 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 530.6+ KB
In [98]:
df.head(3)
Out[98]:
review_text recommended_ind
0 absolutely wonderful silky sexy comfortable 1
1 love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite 1
2 high hope dress really wanted work initially ordered petite small usual size found small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom half tight l... 0
In [99]:
df.columns
Out[99]:
Index(['review_text', 'recommended_ind'], dtype='object')
In [100]:
df[df["recommended_ind"] == 0]
Out[100]:
review_text recommended_ind
2 high hope dress really wanted work initially ordered petite small usual size found small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom half tight l... 0
5 love tracy reese dress one petite foot tall usually wear brand dress pretty package lot dress skirt long full overwhelmed small frame stranger alteration shortening skirt would take away embellish... 0
10 dress run small esp zipper area run ordered sp typically fit tight material top look feel cheap even pulling cause rip fabric pretty disappointed going christmas dress year needle say going back 0
22 first pullover styling side zipper wouldnt purchased knew side zipper large bust side zipper next impossible second tulle feel look cheap slip awkward tight shape underneath look like described sa... 0
25 loved material didnt really look long dress purchased large medium im atleast material foot gap front much wider look felt like dress fell flat returned im usually large med fit better jean 0
... ... ...
23449 tried color really pretty fun want pair pant looked awful muscular thigh dont think cut falttering also comfortable looked crotch seemed little low though pant correct size 0
23450 wanted love jacket soft great color unfortunately light weight almost like shirt fabric nice heft fabric though nice length accept sleeve short small right size looking something slightly warmer l... 0
23460 purchased good price typically love maeve winwin fabric thin slinky unfortunate way made finding appropriate undergarment difficult add slip helped figured since im losing weight would look better... 0
23478 surprised positive review product terrible cut weird place make look wide skirt also like picture darker heavier material isnt great return 0
23483 fit well top see never would worked im glad able try store didnt order online different fabric would great 0

4101 rows × 2 columns

In [101]:
df[df["recommended_ind"] == 1]
Out[101]:
review_text recommended_ind
0 absolutely wonderful silky sexy comfortable 1
1 love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite 1
3 love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment 1
4 shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt 1
6 basket hte last see would look like person store pick went teh color pale hte color really gorgeous turn trying little baggy hte x hte size bummer petite decided though said everything pant skirt ... 1
... ... ...
23479 wasnt sure ordering skirt couldnt see person first im glad skirt design well made doesnt look feel cheap color isnt red worked pale skin better anticipated wore daughter dance production tall boot... 1
23481 happy snag dress great price easy slip flattering cut color combo 1
23482 reminds maternity clothes soft stretchy shiny material cut flattering drape nicely found one button close front looked awkward nice long sleeve maybe others ok 1
23484 bought dress wedding summer cute unfortunately fit isnt perfect medium fit waist perfectly way long big bust shoulder wanted spend money could get tailored felt like might worth side note dress de... 1
23485 dress lovely feminine fit perfectly easy wear comfy highly recommend 1

18540 rows × 2 columns

In [102]:
df["recommended_ind"].value_counts()
Out[102]:
recommended_ind
1    18540
0     4101
Name: count, dtype: int64
In [103]:
" ".join(df["review_text"]).split()
Out[103]:
['absolutely',
 'wonderful',
 'silky',
 'sexy',
 'comfortable',
 'love',
 'dress',
 'sooo',
 'pretty',
 'happened',
 'find',
 'store',
 'im',
 'glad',
 'bc',
 'never',
 'would',
 'ordered',
 'online',
 'bc',
 'petite',
 'bought',
 'petite',
 'love',
 'length',
 'hit',
 'little',
 'knee',
 'would',
 'definitely',
 'true',
 'midi',
 'someone',
 'truly',
 'petite',
 'high',
 'hope',
 'dress',
 'really',
 'wanted',
 'work',
 'initially',
 'ordered',
 'petite',
 'small',
 'usual',
 'size',
 'found',
 'small',
 'small',
 'fact',
 'could',
 'zip',
 'reordered',
 'petite',
 'medium',
 'ok',
 'overall',
 'top',
 'half',
 'comfortable',
 'fit',
 'nicely',
 'bottom',
 'half',
 'tight',
 'layer',
 'several',
 'somewhat',
 'cheap',
 'net',
 'layer',
 'imo',
 'major',
 'design',
 'flaw',
 'net',
 'layer',
 'sewn',
 'directly',
 'zipper',
 'c',
 'love',
 'love',
 'love',
 'jumpsuit',
 'fun',
 'flirty',
 'fabulous',
 'every',
 'time',
 'wear',
 'get',
 'nothing',
 'great',
 'compliment',
 'shirt',
 'flattering',
 'due',
 'adjustable',
 'front',
 'tie',
 'perfect',
 'length',
 'wear',
 'legging',
 'sleeveless',
 'pair',
 'well',
 'cardigan',
 'love',
 'shirt',
 'love',
 'tracy',
 'reese',
 'dress',
 'one',
 'petite',
 'foot',
 'tall',
 'usually',
 'wear',
 'brand',
 'dress',
 'pretty',
 'package',
 'lot',
 'dress',
 'skirt',
 'long',
 'full',
 'overwhelmed',
 'small',
 'frame',
 'stranger',
 'alteration',
 'shortening',
 'skirt',
 'would',
 'take',
 'away',
 'embellishment',
 'garment',
 'love',
 'color',
 'idea',
 'style',
 'work',
 'returned',
 'dress',
 'basket',
 'hte',
 'last',
 'see',
 'would',
 'look',
 'like',
 'person',
 'store',
 'pick',
 'went',
 'teh',
 'color',
 'pale',
 'hte',
 'color',
 'really',
 'gorgeous',
 'turn',
 'trying',
 'little',
 'baggy',
 'hte',
 'x',
 'hte',
 'size',
 'bummer',
 'petite',
 'decided',
 'though',
 'said',
 'everything',
 'pant',
 'skirt',
 'trying',
 'kept',
 'oops',
 'ordered',
 'carbon',
 'store',
 'pick',
 'ton',
 'stuff',
 'always',
 'try',
 'used',
 'top',
 'pair',
 'skirt',
 'pant',
 'everything',
 'went',
 'color',
 'really',
 'nice',
 'charcoal',
 'shimmer',
 'went',
 'well',
 'pencil',
 'skirt',
 'flare',
 'pant',
 'etc',
 'bit',
 'big',
 'sleeve',
 'long',
 'doesnt',
 'go',
 'petite',
 'also',
 'bit',
 'loose',
 'xx',
 'kept',
 'wil',
 'later',
 'since',
 'light',
 'color',
 'already',
 'sold',
 'hte',
 'smallest',
 'size',
 'love',
 'dress',
 'usually',
 'get',
 'x',
 'run',
 'little',
 'snug',
 'bust',
 'ordered',
 'size',
 'flattering',
 'feminine',
 'usual',
 'retailer',
 'flair',
 'style',
 'im',
 'lb',
 'ordered',
 'petite',
 'make',
 'sure',
 'length',
 'wasnt',
 'long',
 'typically',
 'wear',
 'x',
 'regular',
 'retailer',
 'dress',
 'youre',
 'le',
 'busty',
 'cup',
 'smaller',
 'petite',
 'fit',
 'perfectly',
 'snug',
 'tight',
 'love',
 'could',
 'dress',
 'party',
 'work',
 'love',
 'tulle',
 'longer',
 'fabric',
 'underneath',
 'dress',
 'run',
 'small',
 'esp',
 'zipper',
 'area',
 'run',
 'ordered',
 'sp',
 'typically',
 'fit',
 'tight',
 'material',
 'top',
 'look',
 'feel',
 'cheap',
 'even',
 'pulling',
 'cause',
 'rip',
 'fabric',
 'pretty',
 'disappointed',
 'going',
 'christmas',
 'dress',
 'year',
 'needle',
 'say',
 'going',
 'back',
 'dress',
 'perfection',
 'pretty',
 'flattering',
 'find',
 'review',
 'written',
 'savvy',
 'shopper',
 'past',
 'right',
 'product',
 'case',
 'doubt',
 'would',
 'even',
 'tried',
 'dress',
 'beautifully',
 'made',
 'lined',
 'reminiscent',
 'old',
 'retailer',
 'quality',
 'lined',
 'solid',
 'fabric',
 'match',
 'outer',
 'fabric',
 'print',
 'tt',
 'formfitting',
 'fall',
 'knee',
 'rid',
 'bought',
 'black',
 'x',
 'go',
 'midi',
 'dress',
 'didnt',
 'bother',
 'lining',
 'skirt',
 'portion',
 'stats',
 'x',
 'fit',
 'smoothly',
 'around',
 'chest',
 'flowy',
 'around',
 'lower',
 'half',
 'would',
 'say',
 'running',
 'big',
 'strap',
 'pretty',
 'could',
 'easily',
 'im',
 'came',
 'knee',
 'nice',
 'choice',
 'holiday',
 'gathering',
 'like',
 'length',
 'knee',
 'conservative',
 'enough',
 'office',
 'related',
 'gathering',
 'size',
 'small',
 'fit',
 'well',
 'usually',
 'size',
 'small',
 'bust',
 'opinion',
 'run',
 'small',
 'larger',
 'bust',
 'definitely',
 'size',
 'perhaps',
 'waist',
 'big',
 'problem',
 'dress',
 'quality',
 'fabric',
 'terrible',
 'delicate',
 'netting',
 'type',
 'fabric',
 'top',
 'layer',
 'skirt',
 'got',
 'stuck',
 'zip',
 'took',
 'package',
 'wanted',
 'fit',
 'badly',
 'could',
 'tell',
 'put',
 'wouldnt',
 'hourglass',
 'figure',
 'straight',
 'waist',
 'way',
 'small',
 'body',
 'shape',
 'even',
 'sized',
 'could',
 'tell',
 'would',
 'still',
 'tight',
 'waist',
 'roomy',
 'hip',
 'said',
 'really',
 'nice',
 'sturdy',
 'linenlike',
 'fabric',
 'pretty',
 'color',
 'well',
 'made',
 'hope',
 'make',
 'someone',
 'happy',
 'material',
 'color',
 'nice',
 'leg',
 'opening',
 'large',
 'length',
 'hit',
 'right',
 'ankle',
 'leg',
 'opening',
 'size',
 'waist',
 'hem',
 'line',
 'ankle',
 'front',
 'pleat',
 'make',
 'fluffy',
 'think',
 'imagine',
 'flattering',
 'look',
 'least',
 'average',
 'height',
 'taller',
 'may',
 'look',
 'good',
 'took',
 'chance',
 'blouse',
 'glad',
 'wasnt',
 'crazy',
 'blouse',
 'photographed',
 'model',
 'paired',
 'whit',
 'white',
 'pant',
 'worked',
 'perfectly',
 'crisp',
 'clean',
 'would',
 'describe',
 'launders',
 'well',
 'fit',
 'great',
 'drape',
 'perfect',
 'wear',
 'tucked',
 'cant',
 'go',
 'wrong',
 'flattering',
 'super',
 'cozy',
 'coat',
 'work',
 'well',
 'cold',
 'dry',
 'day',
 'look',
 'good',
 'jean',
 'dressier',
 'outfit',
 'small',
 'fit',
 'great',
 'love',
 'look',
 'feel',
 'tulle',
 'dress',
 'looking',
 'something',
 'different',
 'top',
 'new',
 'year',
 'eve',
 'im',
 'small',
 'chested',
 'top',
 'dress',
 'form',
 'fitting',
 'flattering',
 'look',
 'steamed',
 'tulle',
 'perfect',
 'ordered',
 'xsp',
 'length',
 'perfect',
 'product',
 'petite',
 'would',
 'get',
 'petite',
 'regular',
 'little',
 'long',
 'tailor',
 'simple',
 'fix',
 'fit',
 'nicely',
 'im',
 'pregnant',
 'bough',
 'medium',
 'grow',
 'tie',
 'front',
 'back',
 'provides',
 'nice',
 'flexibility',
 'form',
 'fitting',
 'im',
 'upset',
 'price',
 'dress',
 'thought',
 'embroidered',
 'print',
 'fabric',
 'think',
 'little',
 'opened',
 'box',
 'still',
 'ver',
 'pretty',
 'would',
 'say',
 'true',
 'size',
 'tad',
 'bit',
 'big',
 'tiny',
 'still',
 'get',
 'away',
 'color',
 'vibrant',
 'style',
 'unique',
 'skirt',
 'portion',
 'pretty',
 'poofy',
 'keep',
 'going',
 'back',
 'forth',
 'mainly',
 'price',
 'although',
 'quality',
 'definitely',
 'except',
 'wish',
 'first',
 'pullover',
 'styling',
 'side',
 'zipper',
 'wouldnt',
 'purchased',
 'knew',
 'side',
 'zipper',
 'large',
 'bust',
 'side',
 'zipper',
 'next',
 'impossible',
 'second',
 'tulle',
 'feel',
 'look',
 'cheap',
 'slip',
 'awkward',
 'tight',
 'shape',
 'underneath',
 'look',
 'like',
 'described',
 'sadly',
 'returning',
 'im',
 'sure',
 'find',
 'something',
 'exchange',
 'cute',
 'little',
 'dress',
 'fit',
 'tt',
 'little',
 'high',
 'waisted',
 'good',
 'length',
 'height',
 'like',
 'dress',
 'im',
 'love',
 'dont',
 'think',
 'look',
 'feel',
 'cheap',
 'appears',
 'pictured',
 'love',
 'shirt',
 'first',
 'saw',
 'wasnt',
 'sure',
 'shirt',
 'dress',
 'since',
 'seethrough',
 'wear',
 'like',
 'dress',
 'need',
 'slip',
 'wear',
 'legging',
 'bought',
 'slip',
 'wore',
 'tie',
 'back',
 'white',
 'wedge',
 'could',
 'also',
 'wear',
 'vest',
 'careful',
 'button',
 'havent',
 'fall',
 'yet',
 'feel',
 'like',
 'overall',
 'great',
 'occasion',
 'fun',
 'wear',
 'loved',
 'material',
 'didnt',
 'really',
 'look',
 'long',
 'dress',
 'purchased',
 'large',
 'medium',
 'im',
 'atleast',
 'material',
 'foot',
 'gap',
 'front',
 'much',
 'wider',
 'look',
 'felt',
 'like',
 'dress',
 'fell',
 'flat',
 'returned',
 'im',
 'usually',
 'large',
 'med',
 'fit',
 'better',
 'jean',
 'waiting',
 'sweater',
 'coat',
 'ship',
 'week',
 'excited',
 'arrive',
 'coat',
 'true',
 'size',
 'made',
 'look',
 'short',
 'squat',
 'sleeve',
 'wide',
 'although',
 'long',
 'light',
 'weight',
 'fall',
 'coat',
 'sleeve',
 'dont',
 'need',
 'wide',
 'wouldnt',
 'much',
 'underneath',
 'button',
 'need',
 'moved',
 'least',
 'three',
 'inch',
 'nicer',
 'fit',
 'thought',
 'button',
 'sleeve',
 'looked',
 'even',
 'proportion',
 'tigh',
 'color',
 'werent',
 'expected',
 'either',
 'dark',
 'blue',
 'much',
 'vibrant',
 'couldnt',
 'find',
 'anything',
 'really',
 'go',
 'fabric',
 'thick',
 'good',
 'quality',
 'nice',
 'weight',
 'movement',
 'skirt',
 'wasnt',
 'end',
 'several',
 'shirt',
 'get',
 'many',
 'compliment',
 'especially',
 'one',
 'say',
 'dont',
 'hesitate',
 'buy',
 'shirt',
 'wont',
 'sorry',
 'sweater',
 'comfy',
 'classic',
 'balance',
 'quirky',
 'handknit',
 'look',
 'beautiful',
 'color',
 'practical',
 'fit',
 'bit',
 'cropped',
 'boxy',
 'part',
 'style',
 'others',
 'mentioned',
 'gap',
 'knit',
 'make',
 'seethrough',
 'opinion',
 'make',
 'perfect',
 'layering',
 'like',
 'longer',
 'camisole',
 'showing',
 'underneath',
 'wearing',
 'little',
 'dress',
 'warm',
 'still',
 'thin',
 'enough',
 'fit',
 'jacket',
 'coat',
 'beautifully',
 'made',
 'pant',
 'trend',
 'flared',
 'crop',
 'much',
 'cuter',
 'person',
 'love',
 'never',
 'would',
 'given',
 'pant',
 'second',
 'look',
 'online',
 'person',
 'much',
 'cuter',
 'stripe',
 'brighter',
 'fit',
 'flattering',
 'crop',
 'cute',
 'flare',
 'right',
 'trend',
 'brand',
 'always',
 'run',
 'small',
 'carry',
 'belly',
 'paired',
 'loose',
 'navy',
 'blazer',
 'pant',
 'even',
 'better',
 'person',
 'downside',
 'need',
 'dry',
 'cleaned',
 'ordered',
 'month',
 'ago',
 'finally',
 'came',
 'back',
 'order',
 'huge',
 'disappointment',
 'fit',
 'much',
 'issue',
 'quality',
 'wool',
 'subpar',
 'someone',
 'else',
 'mentioned',
 'guess',
 'call',
 'literally',
 'feel',
 'like',
 'felt',
 'super',
 'thin',
 'itchy',
 'drape',
 'well',
 'feel',
 'cheap',
 'made',
 'china',
 'got',
 'sale',
 'still',
 'worth',
 'paid',
 'definitely',
 'going',
 'back',
 'neat',
 'dress',
 'color',
 'great',
 'fabric',
 'super',
 'soft',
 ...]
In [104]:
neg_words = " ".join(df[df["recommended_ind"] == 0].review_text).split()
neg_words 
Out[104]:
['high',
 'hope',
 'dress',
 'really',
 'wanted',
 'work',
 'initially',
 'ordered',
 'petite',
 'small',
 'usual',
 'size',
 'found',
 'small',
 'small',
 'fact',
 'could',
 'zip',
 'reordered',
 'petite',
 'medium',
 'ok',
 'overall',
 'top',
 'half',
 'comfortable',
 'fit',
 'nicely',
 'bottom',
 'half',
 'tight',
 'layer',
 'several',
 'somewhat',
 'cheap',
 'net',
 'layer',
 'imo',
 'major',
 'design',
 'flaw',
 'net',
 'layer',
 'sewn',
 'directly',
 'zipper',
 'c',
 'love',
 'tracy',
 'reese',
 'dress',
 'one',
 'petite',
 'foot',
 'tall',
 'usually',
 'wear',
 'brand',
 'dress',
 'pretty',
 'package',
 'lot',
 'dress',
 'skirt',
 'long',
 'full',
 'overwhelmed',
 'small',
 'frame',
 'stranger',
 'alteration',
 'shortening',
 'skirt',
 'would',
 'take',
 'away',
 'embellishment',
 'garment',
 'love',
 'color',
 'idea',
 'style',
 'work',
 'returned',
 'dress',
 'dress',
 'run',
 'small',
 'esp',
 'zipper',
 'area',
 'run',
 'ordered',
 'sp',
 'typically',
 'fit',
 'tight',
 'material',
 'top',
 'look',
 'feel',
 'cheap',
 'even',
 'pulling',
 'cause',
 'rip',
 'fabric',
 'pretty',
 'disappointed',
 'going',
 'christmas',
 'dress',
 'year',
 'needle',
 'say',
 'going',
 'back',
 'first',
 'pullover',
 'styling',
 'side',
 'zipper',
 'wouldnt',
 'purchased',
 'knew',
 'side',
 'zipper',
 'large',
 'bust',
 'side',
 'zipper',
 'next',
 'impossible',
 'second',
 'tulle',
 'feel',
 'look',
 'cheap',
 'slip',
 'awkward',
 'tight',
 'shape',
 'underneath',
 'look',
 'like',
 'described',
 'sadly',
 'returning',
 'im',
 'sure',
 'find',
 'something',
 'exchange',
 'loved',
 'material',
 'didnt',
 'really',
 'look',
 'long',
 'dress',
 'purchased',
 'large',
 'medium',
 'im',
 'atleast',
 'material',
 'foot',
 'gap',
 'front',
 'much',
 'wider',
 'look',
 'felt',
 'like',
 'dress',
 'fell',
 'flat',
 'returned',
 'im',
 'usually',
 'large',
 'med',
 'fit',
 'better',
 'jean',
 'waiting',
 'sweater',
 'coat',
 'ship',
 'week',
 'excited',
 'arrive',
 'coat',
 'true',
 'size',
 'made',
 'look',
 'short',
 'squat',
 'sleeve',
 'wide',
 'although',
 'long',
 'light',
 'weight',
 'fall',
 'coat',
 'sleeve',
 'dont',
 'need',
 'wide',
 'wouldnt',
 'much',
 'underneath',
 'button',
 'need',
 'moved',
 'least',
 'three',
 'inch',
 'nicer',
 'fit',
 'thought',
 'button',
 'sleeve',
 'looked',
 'even',
 'proportion',
 'tigh',
 'ordered',
 'month',
 'ago',
 'finally',
 'came',
 'back',
 'order',
 'huge',
 'disappointment',
 'fit',
 'much',
 'issue',
 'quality',
 'wool',
 'subpar',
 'someone',
 'else',
 'mentioned',
 'guess',
 'call',
 'literally',
 'feel',
 'like',
 'felt',
 'super',
 'thin',
 'itchy',
 'drape',
 'well',
 'feel',
 'cheap',
 'made',
 'china',
 'got',
 'sale',
 'still',
 'worth',
 'paid',
 'definitely',
 'going',
 'back',
 'pregnant',
 'thought',
 'would',
 'great',
 'sleep',
 'bra',
 'soft',
 'fit',
 'okay',
 'zero',
 'support',
 'shape',
 'would',
 'buy',
 'b',
 'cup',
 'smaller',
 'get',
 'away',
 'without',
 'support',
 'would',
 'seen',
 'store',
 'would',
 'passed',
 'however',
 'lazy',
 'return',
 'wearing',
 'comfortable',
 'thats',
 'redeeming',
 'quality',
 'would',
 'recommend',
 'larger',
 'chested',
 'lady',
 'though',
 'tank',
 'fit',
 'well',
 'loved',
 'ruffle',
 'back',
 'layed',
 'front',
 'good',
 'look',
 'retuning',
 'tag',
 'sewn',
 'small',
 'long',
 'huge',
 'x',
 'itchy',
 'cut',
 'thread',
 'left',
 'behind',
 'plasticy',
 'even',
 'itchy',
 'make',
 'item',
 'itchy',
 'tag',
 'comfortable',
 'also',
 'love',
 'bralettes',
 'wear',
 'time',
 'including',
 'work',
 'b',
 'cup',
 'however',
 'one',
 'thin',
 'flimsy',
 'give',
 'support',
 'even',
 'b',
 'cup',
 'would',
 'lounging',
 'bralette',
 'wasnt',
 'itchy',
 'really',
 'loved',
 'top',
 'online',
 'wanted',
 'love',
 'person',
 'soft',
 'patter',
 'okay',
 'person',
 'neckline',
 'higher',
 'used',
 'also',
 'two',
 'button',
 'back',
 'must',
 'unbuttoned',
 'order',
 'wear',
 'top',
 'difficult',
 'button',
 'behind',
 'neck',
 'top',
 'unfortunately',
 'return',
 'item',
 'really',
 'wanted',
 'work',
 'ala',
 'strange',
 'fit',
 'strap',
 'would',
 'stay',
 'weird',
 'fit',
 'breast',
 'worked',
 'standing',
 'minute',
 'sat',
 'fell',
 'shoulder',
 'fabric',
 'beautiful',
 'loved',
 'pocket',
 'designer',
 'keep',
 'making',
 'crop',
 'top',
 'cant',
 'imagine',
 'would',
 'flattering',
 'anyone',
 'especially',
 'someone',
 'average',
 'height',
 'well',
 'endowed',
 'top',
 'looked',
 'like',
 'football',
 'player',
 'pattern',
 'fabric',
 'gorgeous',
 'like',
 'tall',
 'super',
 'tiny',
 'fit',
 'xx',
 'may',
 'work',
 'fit',
 'small',
 'huge',
 'almost',
 'bigger',
 'bottom',
 'top',
 'weird',
 'cut',
 'cami',
 'underneath',
 'ride',
 'show',
 'zipper',
 'broke',
 'piece',
 'first',
 'time',
 'wore',
 'disappointing',
 'since',
 'love',
 'design',
 'im',
 'actually',
 'going',
 'try',
 'replace',
 'zipper',
 'something',
 'stronger',
 'annoying',
 'come',
 'really',
 'hoping',
 'like',
 'look',
 'way',
 'model',
 'least',
 'hem',
 'much',
 'pronounced',
 'looser',
 'one',
 'photo',
 'look',
 'like',
 'pinned',
 'back',
 'usually',
 'wear',
 'medium',
 'large',
 'got',
 'medium',
 'lot',
 'material',
 'bottom',
 'half',
 'photo',
 'show',
 'made',
 'look',
 'bigger',
 'flattering',
 'material',
 'nice',
 'weave',
 'thin',
 'delicate',
 'bought',
 'holly',
 'deep',
 'olive',
 'blue',
 'color',
 'really',
 'cute',
 'piece',
 'huge',
 'ordered',
 'xx',
 'petite',
 'unfortunately',
 'extremely',
 'wide',
 'flattering',
 'returning',
 'usually',
 'wear',
 'medium',
 'bought',
 'small',
 'fit',
 'ok',
 'shape',
 'flattering',
 'love',
 'baby',
 'doll',
 'dress',
 'top',
 'tent',
 'daughter',
 'saw',
 'try',
 'said',
 'thats',
 'piece',
 'tablecloth',
 'going',
 'back',
 'excited',
 'order',
 'top',
 'red',
 'x',
 'cute',
 'huge',
 'shapeless',
 'support',
 'thin',
 'go',
 'back',
 'shouldve',
 'looked',
 'review',
 'need',
 'easy',
 'comfortable',
 'top',
 'everyday',
 'wear',
 'bought',
 'top',
 'mostly',
 'cute',
 'button',
 'received',
 'looked',
 'exactly',
 'picture',
 'online',
 'however',
 'button',
 'kept',
 'slipping',
 'home',
 'hole',
 'slightly',
 'big',
 'shirt',
 'fit',
 'tad',
 'snug',
 'near',
 'upper',
 'arm',
 'would',
 'stretch',
 'loosen',
 'throughout',
 'day',
 'definitely',
 'comfortable',
 'shirt',
 'felt',
 'like',
 'pajama',
 'top',
 'going',
 'back',
 'fabric',
 'felt',
 'cheap',
 'didnt',
 'find',
 'flattering',
 'top',
 'reference',
 'wearing',
 'medium',
 'photo',
 'measurement',
 'run',
 'big',
 'looked',
 'unflattering',
 'petite',
 'might',
 'work',
 'someone',
 'taller',
 'thin',
 'poor',
 'quality',
 'especially',
 'price',
 'felt',
 'like',
 'thin',
 'pajama',
 'top',
 'button',
 'terrible',
 'little',
 'shell',
 'button',
 'could',
 'returned',
 'faster',
 'dress',
 'quite',
 'flattering',
 'flirty',
 'feminine',
 'way',
 'dress',
 'received',
 'new',
 'color',
 'faded',
 'washed',
 'red',
 'black',
 'stain',
 'belt',
 'area',
 'tag',
 'fabric',
 'look',
 'droopy',
 'laundered',
 'crisp',
 'stiff',
 'new',
 'disappointed',
 'quality',
 'item',
 'received',
 'one',
 'going',
 'back',
 'dear',
 'retailer',
 'please',
 'make',
 'sure',
 'send',
 'clothing',
 'article',
 'first',
 'fabric',
 'beautiful',
 'lovely',
 'spring',
 'summer',
 'really',
 'wanted',
 'like',
 'top',
 'fitting',
 'awkward',
 'typically',
 'sized',
 'shirt',
 'size',
 'tight',
 'pulled',
 'funny',
 'across',
 'chest',
 'size',
 'also',
 'found',
 'cut',
 'shoulder',
 'narrow',
 'need',
 'strapless',
 'bra',
 'made',
 'look',
 'unflattering',
 'overall',
 'return',
 'one',
 'back',
 'store',
 'excited',
 'try',
 'top',
 'since',
 'bargain',
 'neutral',
 'unfortunately',
 'shape',
 'line',
 'accentuates',
 'hip',
 'area',
 'bit',
 'find',
 'flattering',
 'returned',
 'tried',
 'first',
 'comment',
 'take',
 'scratchy',
 'didnt',
 'bother',
 'trying',
 'however',
 'beautiful',
 'sensitive',
 'scratchy',
 'material',
 'fit',
 'true',
 'size',
 'keeping',
 'one',
 'fabric',
 'bit',
 'person',
 'cut',
 'odd',
 'fit',
 'fine',
 'snap',
 'keep',
 'neckline',
 'flat',
 'shaped',
 'color',
 'shown',
 'good',
 'length',
 'fall',
 'top',
 'hip',
 'simply',
 'like',
 'metallic',
 'looking',
 'maybe',
 'disappointed',
 'mention',
 'suit',
 'australian',
 'size',
 'ordered',
 'im',
 'usually',
 'sold',
 'suit',
 'arrives',
 'doesnt',
 'even',
 'fir',
 'hip',
 'label',
 'clearly',
 'say',
 'australian',
 'u',
 'return',
 'suit',
 'sizing',
 'issue',
 'suit',
 'look',
 'well',
 'made',
 'design',
 'adorable',
 'title',
 'say',
 'fabric',
 'top',
 'best',
 'worst',
 'part',
 'design',
 'color',
 'vibrant',
 'combination',
 'material',
 'shoulder',
 'knit',
 'sweaterlike',
 'navy',
 'fabric',
 'interesting',
 'however',
 'positive',
 'comment',
 'end',
 'one',
 'top',
 'scratchy',
 'stiff',
 'frankly',
 'uncomfortable',
 'imagine',
 'wanting',
 'wear',
 'could',
 'benefit',
 'lining',
 'might',
 'solved',
 'problem',
 'scratchy',
 'itchy',
 'fabric',
 'stiff',
 'fabric',
 'bo',
 'color',
 'vivid',
 'perfectly',
 'fit',
 'mess',
 'overall',
 'large',
 'waistline',
 'curve',
 'front',
 'fall',
 'small',
 'pleat',
 'maternityish',
 'waistband',
 'thicker',
 'dress',
 'sat',
 'away',
 'body',
 'material',
 'poly',
 'outer',
 'dress',
 'made',
 'material',
 'lining',
 'would',
 'liked',
 'better',
 'modesty',
 'closure',
 'plus',
 'dress',
 'already',
 'unraveling',
 'took',
 'returned',
 'online',
 'look',
 'like',
 'great',
 'sweater',
 'ordered',
 'xxsp',
 'found',
 'sweater',
 'much',
 'wider',
 'middle',
 'pictured',
 'fact',
 'im',
 'pretty',
 'sure',
 'pinned',
 'shirt',
 'back',
 'picture',
 'make',
 'appear',
 'slimmer',
 'unfortunately',
 'sweater',
 'work',
 'hourglass',
 'shape',
 'shirt',
 'make',
 'look',
 'pound',
 'heavier',
 'worried',
 'item',
 'ordered',
 'look',
 'picture',
 'thinking',
 'gone',
 'gut',
 'shirt',
 'quality',
 'retailer',
 'purchase',
 'seethrough',
 'flimsy',
 'bottom',
 'like',
 'picture',
 'hang',
 'odd',
 'rumpled',
 'way',
 'top',
 'flattering',
 'though',
 'shame',
 'bottom',
 'fit',
 'nicer',
 'like',
 'product',
 'could',
 'gotten',
 'away',
 'wearing',
 'cami',
 'make',
 'qu',
 'torn',
 'whether',
 'return',
 'ultimately',
 'going',
 'back',
 'knit',
 'thin',
 'thought',
 'would',
 'cozy',
 'normal',
 'sweater',
 'weight',
 'light',
 'swing',
 'effect',
 'doesnt',
 'really',
 'come',
 'nothing',
 'special',
 ...]
In [105]:
pos_words =" ".join(df[df["recommended_ind"] == 1].review_text).split()
pos_words
Out[105]:
['absolutely',
 'wonderful',
 'silky',
 'sexy',
 'comfortable',
 'love',
 'dress',
 'sooo',
 'pretty',
 'happened',
 'find',
 'store',
 'im',
 'glad',
 'bc',
 'never',
 'would',
 'ordered',
 'online',
 'bc',
 'petite',
 'bought',
 'petite',
 'love',
 'length',
 'hit',
 'little',
 'knee',
 'would',
 'definitely',
 'true',
 'midi',
 'someone',
 'truly',
 'petite',
 'love',
 'love',
 'love',
 'jumpsuit',
 'fun',
 'flirty',
 'fabulous',
 'every',
 'time',
 'wear',
 'get',
 'nothing',
 'great',
 'compliment',
 'shirt',
 'flattering',
 'due',
 'adjustable',
 'front',
 'tie',
 'perfect',
 'length',
 'wear',
 'legging',
 'sleeveless',
 'pair',
 'well',
 'cardigan',
 'love',
 'shirt',
 'basket',
 'hte',
 'last',
 'see',
 'would',
 'look',
 'like',
 'person',
 'store',
 'pick',
 'went',
 'teh',
 'color',
 'pale',
 'hte',
 'color',
 'really',
 'gorgeous',
 'turn',
 'trying',
 'little',
 'baggy',
 'hte',
 'x',
 'hte',
 'size',
 'bummer',
 'petite',
 'decided',
 'though',
 'said',
 'everything',
 'pant',
 'skirt',
 'trying',
 'kept',
 'oops',
 'ordered',
 'carbon',
 'store',
 'pick',
 'ton',
 'stuff',
 'always',
 'try',
 'used',
 'top',
 'pair',
 'skirt',
 'pant',
 'everything',
 'went',
 'color',
 'really',
 'nice',
 'charcoal',
 'shimmer',
 'went',
 'well',
 'pencil',
 'skirt',
 'flare',
 'pant',
 'etc',
 'bit',
 'big',
 'sleeve',
 'long',
 'doesnt',
 'go',
 'petite',
 'also',
 'bit',
 'loose',
 'xx',
 'kept',
 'wil',
 'later',
 'since',
 'light',
 'color',
 'already',
 'sold',
 'hte',
 'smallest',
 'size',
 'love',
 'dress',
 'usually',
 'get',
 'x',
 'run',
 'little',
 'snug',
 'bust',
 'ordered',
 'size',
 'flattering',
 'feminine',
 'usual',
 'retailer',
 'flair',
 'style',
 'im',
 'lb',
 'ordered',
 'petite',
 'make',
 'sure',
 'length',
 'wasnt',
 'long',
 'typically',
 'wear',
 'x',
 'regular',
 'retailer',
 'dress',
 'youre',
 'le',
 'busty',
 'cup',
 'smaller',
 'petite',
 'fit',
 'perfectly',
 'snug',
 'tight',
 'love',
 'could',
 'dress',
 'party',
 'work',
 'love',
 'tulle',
 'longer',
 'fabric',
 'underneath',
 'dress',
 'perfection',
 'pretty',
 'flattering',
 'find',
 'review',
 'written',
 'savvy',
 'shopper',
 'past',
 'right',
 'product',
 'case',
 'doubt',
 'would',
 'even',
 'tried',
 'dress',
 'beautifully',
 'made',
 'lined',
 'reminiscent',
 'old',
 'retailer',
 'quality',
 'lined',
 'solid',
 'fabric',
 'match',
 'outer',
 'fabric',
 'print',
 'tt',
 'formfitting',
 'fall',
 'knee',
 'rid',
 'bought',
 'black',
 'x',
 'go',
 'midi',
 'dress',
 'didnt',
 'bother',
 'lining',
 'skirt',
 'portion',
 'stats',
 'x',
 'fit',
 'smoothly',
 'around',
 'chest',
 'flowy',
 'around',
 'lower',
 'half',
 'would',
 'say',
 'running',
 'big',
 'strap',
 'pretty',
 'could',
 'easily',
 'im',
 'came',
 'knee',
 'nice',
 'choice',
 'holiday',
 'gathering',
 'like',
 'length',
 'knee',
 'conservative',
 'enough',
 'office',
 'related',
 'gathering',
 'size',
 'small',
 'fit',
 'well',
 'usually',
 'size',
 'small',
 'bust',
 'opinion',
 'run',
 'small',
 'larger',
 'bust',
 'definitely',
 'size',
 'perhaps',
 'waist',
 'big',
 'problem',
 'dress',
 'quality',
 'fabric',
 'terrible',
 'delicate',
 'netting',
 'type',
 'fabric',
 'top',
 'layer',
 'skirt',
 'got',
 'stuck',
 'zip',
 'took',
 'package',
 'wanted',
 'fit',
 'badly',
 'could',
 'tell',
 'put',
 'wouldnt',
 'hourglass',
 'figure',
 'straight',
 'waist',
 'way',
 'small',
 'body',
 'shape',
 'even',
 'sized',
 'could',
 'tell',
 'would',
 'still',
 'tight',
 'waist',
 'roomy',
 'hip',
 'said',
 'really',
 'nice',
 'sturdy',
 'linenlike',
 'fabric',
 'pretty',
 'color',
 'well',
 'made',
 'hope',
 'make',
 'someone',
 'happy',
 'material',
 'color',
 'nice',
 'leg',
 'opening',
 'large',
 'length',
 'hit',
 'right',
 'ankle',
 'leg',
 'opening',
 'size',
 'waist',
 'hem',
 'line',
 'ankle',
 'front',
 'pleat',
 'make',
 'fluffy',
 'think',
 'imagine',
 'flattering',
 'look',
 'least',
 'average',
 'height',
 'taller',
 'may',
 'look',
 'good',
 'took',
 'chance',
 'blouse',
 'glad',
 'wasnt',
 'crazy',
 'blouse',
 'photographed',
 'model',
 'paired',
 'whit',
 'white',
 'pant',
 'worked',
 'perfectly',
 'crisp',
 'clean',
 'would',
 'describe',
 'launders',
 'well',
 'fit',
 'great',
 'drape',
 'perfect',
 'wear',
 'tucked',
 'cant',
 'go',
 'wrong',
 'flattering',
 'super',
 'cozy',
 'coat',
 'work',
 'well',
 'cold',
 'dry',
 'day',
 'look',
 'good',
 'jean',
 'dressier',
 'outfit',
 'small',
 'fit',
 'great',
 'love',
 'look',
 'feel',
 'tulle',
 'dress',
 'looking',
 'something',
 'different',
 'top',
 'new',
 'year',
 'eve',
 'im',
 'small',
 'chested',
 'top',
 'dress',
 'form',
 'fitting',
 'flattering',
 'look',
 'steamed',
 'tulle',
 'perfect',
 'ordered',
 'xsp',
 'length',
 'perfect',
 'product',
 'petite',
 'would',
 'get',
 'petite',
 'regular',
 'little',
 'long',
 'tailor',
 'simple',
 'fix',
 'fit',
 'nicely',
 'im',
 'pregnant',
 'bough',
 'medium',
 'grow',
 'tie',
 'front',
 'back',
 'provides',
 'nice',
 'flexibility',
 'form',
 'fitting',
 'im',
 'upset',
 'price',
 'dress',
 'thought',
 'embroidered',
 'print',
 'fabric',
 'think',
 'little',
 'opened',
 'box',
 'still',
 'ver',
 'pretty',
 'would',
 'say',
 'true',
 'size',
 'tad',
 'bit',
 'big',
 'tiny',
 'still',
 'get',
 'away',
 'color',
 'vibrant',
 'style',
 'unique',
 'skirt',
 'portion',
 'pretty',
 'poofy',
 'keep',
 'going',
 'back',
 'forth',
 'mainly',
 'price',
 'although',
 'quality',
 'definitely',
 'except',
 'wish',
 'cute',
 'little',
 'dress',
 'fit',
 'tt',
 'little',
 'high',
 'waisted',
 'good',
 'length',
 'height',
 'like',
 'dress',
 'im',
 'love',
 'dont',
 'think',
 'look',
 'feel',
 'cheap',
 'appears',
 'pictured',
 'love',
 'shirt',
 'first',
 'saw',
 'wasnt',
 'sure',
 'shirt',
 'dress',
 'since',
 'seethrough',
 'wear',
 'like',
 'dress',
 'need',
 'slip',
 'wear',
 'legging',
 'bought',
 'slip',
 'wore',
 'tie',
 'back',
 'white',
 'wedge',
 'could',
 'also',
 'wear',
 'vest',
 'careful',
 'button',
 'havent',
 'fall',
 'yet',
 'feel',
 'like',
 'overall',
 'great',
 'occasion',
 'fun',
 'wear',
 'color',
 'werent',
 'expected',
 'either',
 'dark',
 'blue',
 'much',
 'vibrant',
 'couldnt',
 'find',
 'anything',
 'really',
 'go',
 'fabric',
 'thick',
 'good',
 'quality',
 'nice',
 'weight',
 'movement',
 'skirt',
 'wasnt',
 'end',
 'several',
 'shirt',
 'get',
 'many',
 'compliment',
 'especially',
 'one',
 'say',
 'dont',
 'hesitate',
 'buy',
 'shirt',
 'wont',
 'sorry',
 'sweater',
 'comfy',
 'classic',
 'balance',
 'quirky',
 'handknit',
 'look',
 'beautiful',
 'color',
 'practical',
 'fit',
 'bit',
 'cropped',
 'boxy',
 'part',
 'style',
 'others',
 'mentioned',
 'gap',
 'knit',
 'make',
 'seethrough',
 'opinion',
 'make',
 'perfect',
 'layering',
 'like',
 'longer',
 'camisole',
 'showing',
 'underneath',
 'wearing',
 'little',
 'dress',
 'warm',
 'still',
 'thin',
 'enough',
 'fit',
 'jacket',
 'coat',
 'beautifully',
 'made',
 'pant',
 'trend',
 'flared',
 'crop',
 'much',
 'cuter',
 'person',
 'love',
 'never',
 'would',
 'given',
 'pant',
 'second',
 'look',
 'online',
 'person',
 'much',
 'cuter',
 'stripe',
 'brighter',
 'fit',
 'flattering',
 'crop',
 'cute',
 'flare',
 'right',
 'trend',
 'brand',
 'always',
 'run',
 'small',
 'carry',
 'belly',
 'paired',
 'loose',
 'navy',
 'blazer',
 'pant',
 'even',
 'better',
 'person',
 'downside',
 'need',
 'dry',
 'cleaned',
 'neat',
 'dress',
 'color',
 'great',
 'fabric',
 'super',
 'soft',
 'tall',
 'long',
 'length',
 'added',
 'bonus',
 'definitely',
 'need',
 'something',
 'underneath',
 'since',
 'front',
 'gap',
 'going',
 'pair',
 'funky',
 'tank',
 'top',
 'necklace',
 'boot',
 'super',
 'cute',
 'wouldnt',
 'given',
 'second',
 'look',
 'tried',
 'store',
 'whim',
 'love',
 'love',
 'comfortable',
 'skirt',
 'span',
 'season',
 'easily',
 'exciting',
 'design',
 'good',
 'work',
 'skirt',
 'paired',
 'many',
 'top',
 'ordered',
 'small',
 'size',
 'medium',
 'mom',
 'size',
 'gorgeous',
 'beautifully',
 'draped',
 'ill',
 'need',
 'houston',
 'fall',
 'winter',
 'look',
 'polished',
 'snapped',
 'ageappropriate',
 'mom',
 'look',
 'amazing',
 'skinny',
 'jean',
 'legging',
 'ordered',
 'gray',
 'true',
 'photo',
 'super',
 'cute',
 'comfy',
 'pull',
 'sizing',
 'accurate',
 'material',
 'little',
 'bit',
 'stretch',
 'great',
 'casual',
 'top',
 'flare',
 'look',
 'cute',
 'grey',
 'pilcro',
 'stet',
 'jean',
 'flattering',
 'peplum',
 'back',
 'nice',
 'cut',
 'shoulder',
 'neckline',
 'pretty',
 'unique',
 'great',
 'jean',
 'worn',
 'work',
 'slack',
 'heel',
 'color',
 'print',
 'embroidery',
 'lovely',
 'reasonably',
 'priced',
 'beautiful',
 'top',
 'unique',
 'ordinary',
 'bought',
 'usual',
 'medium',
 'found',
 'fit',
 'tight',
 'across',
 'chest',
 'although',
 'baby',
 'year',
 'nursing',
 'could',
 'bought',
 'would',
 'size',
 'poncho',
 'cute',
 'love',
 'plaid',
 'check',
 'design',
 'color',
 'look',
 'like',
 'sorbet',
 'cream',
 'pair',
 'well',
 'turtleneck',
 'jean',
 'pencil',
 'skirt',
 'heel',
 'love',
 'look',
 'fall',
 'roll',
 'right',
 'spring',
 'great',
 'buy',
 'first',
 'thermal',
 'naturally',
 'didnt',
 'expect',
 'super',
 'sheer',
 'really',
 'sheer',
 'light',
 'fabric',
 'like',
 'prepared',
 'considering',
 'might',
 'run',
 'walk',
 'around',
 'house',
 'second',
 'large',
 'ordered',
 'size',
 'nice',
 'oversized',
 'fit',
 'pound',
 'wouldnt',
 'want',
 'go',
 'larger',
 'along',
 'sheer',
 'fabric',
 'easily',
 'stretched',
 'dont',
 'mind',
 'case',
 'finally',
 'color',
 'look',
 'white',
 'monitor',
 'show',
 'colorful',
 'blue',
 'dot',
 'tried',
 'today',
 'local',
 'retailer',
 'comfortable',
 'flattering',
 'bad',
 'picture',
 'online',
 'model',
 'tucking',
 'skirt',
 'cant',
 'see',
 'ruching',
 'across',
 'front',
 'little',
 'dressier',
 'alternative',
 'plain',
 'tee',
 'reasonably',
 'priced',
 'retailer',
 'generally',
 'wear',
 'small',
 'fit',
 'well',
 'probably',
 'back',
 'black',
 'bought',
 'item',
 'online',
 'fit',
 'model',
 'looked',
 'little',
 'loose',
 'got',
 'mine',
 'seemed',
 'bit',
 'tight',
 'took',
 'back',
 'store',
 'ordered',
 'larger',
 'size',
 'sale',
 'price',
 'great',
 'top',
 'love',
 'top',
 'wear',
 'time',
 'problem',
 'tell',
 'wear',
 'time',
 ...]
In [106]:
review_text = df["review_text"]
In [107]:
all_words = " ".join(review_text)
In [108]:
all_words[:100]
Out[108]:
'absolutely wonderful silky sexy comfortable love dress sooo pretty happened find store im glad bc ne'
In [109]:
from wordcloud import WordCloud 

wordcloud = WordCloud(background_color="white", max_words =250).generate(all_words)

plt.figure(figsize = (13, 13))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [110]:
wordcloud = WordCloud(background_color="white", max_words =250, colormap='gist_heat').generate(str(neg_words))

plt.figure(figsize = (13, 13))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [111]:
wordcloud = WordCloud(background_color="white", max_words =250, colormap='cool').generate(str(pos_words))

plt.figure(figsize = (13, 13))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [112]:
df.head()
Out[112]:
review_text recommended_ind
0 absolutely wonderful silky sexy comfortable 1
1 love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite 1
2 high hope dress really wanted work initially ordered petite small usual size found small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom half tight l... 0
3 love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment 1
4 shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt 1
In [113]:
from sklearn.model_selection import train_test_split
In [114]:
X = df["review_text"]
y= df["recommended_ind"]
In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=101)
In [116]:
from sklearn.feature_extraction.text import CountVectorizer
In [117]:
vectorizer = CountVectorizer()

X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)
In [118]:
type(X_train_count)
Out[118]:
scipy.sparse._csr.csr_matrix
In [119]:
X_train_count.toarray()
Out[119]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
In [121]:
vectorizer.get_feature_names_out()
Out[121]:
array(['aa', 'ab', 'abby', ..., 'zipping', 'zone', 'zoom'], dtype=object)
In [123]:
pd.DataFrame(X_train_count.toarray(), columns = vectorizer.get_feature_names_out())
Out[123]:
aa ab abby abdomen ability able abo abovetheknee absolute absolutely absolutley abstract absurd abt abundance ac accent accented accentuate accentuated accentuates accentuating accept acceptable access accessorize accessorized accessorizing accessory accident accidental accidentally accommodate accommodates accommodating accomodate according accordingly account accurate accurately achieve across acrylic act action active activewear activity actual actuality actually ad ada add added addicted adding addition additional additionally address adequate adequately adjust adjustable adjusted adjusting adjustment admire admired admiring admit admittedly adn ador adorable adore adored adult advantage advertised advice advise advised aesthetic aesthetically affair affect afford affordable afraid afternoon afterward afterwards ag age ageappropriate aged ago ... word wore work workable workappropriate worked working workmanship workout workplace workthe world worn worried worry worrying worse worst worth worthwhile worthy wou woul would wouldnt wouldve wound woven wow wowed wrap wrapped wrapping wri wring wrinkle wrinkled wrinkling wrinkly wrist write writing written wrong wrote xl xlarge xmas xsi xsmall xsmallsmall xsp xspetite xssm xtra xx xxl xxsmall xxsp yarn yay yeah year yearold yearround yellow yellowish yelloworange yellowy yep yes yesterday yet yikes yo yoga yoke york youd youll young younger youre youthful youve yr yuck yucky yummy zag zero zig zigzag zip zipped zipper zippered zipping zone zoom
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18107 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18108 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18109 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18110 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18111 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

18112 rows × 5791 columns

In [124]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [125]:
tf_idf_vectorizer = TfidfVectorizer()

X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
In [126]:
X_train_tf_idf.toarray()
Out[126]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
In [128]:
pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out())
Out[128]:
aa ab abby abdomen ability able abo abovetheknee absolute absolutely absolutley abstract absurd abt abundance ac accent accented accentuate accentuated accentuates accentuating accept acceptable access accessorize accessorized accessorizing accessory accident accidental accidentally accommodate accommodates accommodating accomodate according accordingly account accurate accurately achieve across acrylic act action active activewear activity actual actuality actually ad ada add added addicted adding addition additional additionally address adequate adequately adjust adjustable adjusted adjusting adjustment admire admired admiring admit admittedly adn ador adorable adore adored adult advantage advertised advice advise advised aesthetic aesthetically affair affect afford affordable afraid afternoon afterward afterwards ag age ageappropriate aged ago ... word wore work workable workappropriate worked working workmanship workout workplace workthe world worn worried worry worrying worse worst worth worthwhile worthy wou woul would wouldnt wouldve wound woven wow wowed wrap wrapped wrapping wri wring wrinkle wrinkled wrinkling wrinkly wrist write writing written wrong wrote xl xlarge xmas xsi xsmall xsmallsmall xsp xspetite xssm xtra xx xxl xxsmall xxsp yarn yay yeah year yearold yearround yellow yellowish yelloworange yellowy yep yes yesterday yet yikes yo yoga yoke york youd youll young younger youre youthful youve yr yuck yucky yummy zag zero zig zigzag zip zipped zipper zippered zipping zone zoom
0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
3 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
4 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18107 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.18 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
18108 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.11 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
18109 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.29 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.20 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
18110 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
18111 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.23 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

18112 rows × 5791 columns

In [129]:
from sklearn.metrics import confusion_matrix,classification_report, f1_score, recall_score, accuracy_score, precision_score
In [130]:
def eval(model, X_train, X_test):
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    
    print("Test_Set")
    print(classification_report(y_test, y_pred))
    print("Train_Set")
    print(classification_report(y_train, y_pred_train))
    fig, ax = plt.subplots(figsize=(8, 8))
    
    plot_confusion_matrix(model, X_test, y_test, ax=ax)
In [1]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(C =0.6, max_iter=1000, class_weight= "balanced", random_state=101)
log.fit(X_train_count,y_train)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 4
      1 from sklearn.linear_model import LogisticRegression
      3 log = LogisticRegression(C =0.6, max_iter=1000, class_weight= "balanced", random_state=101)
----> 4 log.fit(X_train_count,y_train)

NameError: name 'X_train_count' is not defined
In [133]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    model = LogisticRegression(C =0.6, max_iter=1000, class_weight= "balanced", random_state=101)
    scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        log_count_rec = scores
    elif i == "f1-1":
        log_count_f1 = scores
    print(f" {i:20} score for count : {scores}\n")
 
 accuracy             score for count : 0.8663865231239555

 precision-0          score for count : 0.6004732142128997

 recall-0             score for count : 0.7860432574690488

 f1-0                 score for count : 0.6806600348422279

 precision-1          score for count : 0.9492144073093828

 recall-1             score for count : 0.8841613760989325

 f1-1                 score for count : 0.9155098938458213

In [134]:
from yellowbrick.classifier import PrecisionRecallCurve
viz = PrecisionRecallCurve(
                            LogisticRegression(C =0.6, max_iter=1000, class_weight= "balanced", random_state=101),
                            classes=log.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_count,y_train)
viz.score(X_test_count, y_test)
viz.show();
In [135]:
log_AP_count = viz.score_
In [136]:
log = LogisticRegression(C=0.1, max_iter=1000, random_state=101, class_weight="balanced")

log.fit(X_train_tf_idf,y_train)
Out[136]:
LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000,
                   random_state=101)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000,
                   random_state=101)
In [137]:
print("LOG MODEL")

eval(log, X_train_tf_idf, X_test_tf_idf)
LOG MODEL
Test_Set
              precision    recall  f1-score   support

           0       0.55      0.86      0.67       820
           1       0.97      0.85      0.90      3709

    accuracy                           0.85      4529
   macro avg       0.76      0.85      0.79      4529
weighted avg       0.89      0.85      0.86      4529

Train_Set
              precision    recall  f1-score   support

           0       0.57      0.89      0.69      3281
           1       0.97      0.85      0.91     14831

    accuracy                           0.86     18112
   macro avg       0.77      0.87      0.80     18112
weighted avg       0.90      0.86      0.87     18112

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[137], line 3
      1 print("LOG MODEL")
----> 3 eval(log, X_train_tf_idf, X_test_tf_idf)

Cell In[130], line 11, in eval(model, X_train, X_test)
      8 print(classification_report(y_train, y_pred_train))
      9 fig, ax = plt.subplots(figsize=(8, 8))
---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax)

NameError: name 'plot_confusion_matrix' is not defined
In [138]:
custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    LogisticRegression(C=0.1, max_iter=1000, random_state=101, class_weight="balanced")
    scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        log_tfidf_rec = scores
    elif i == "f1-1":
        log_tfidf_f1 = scores
    print(f" {i:20} score for tfidf : {scores}\n")
 accuracy             score for tfidf : 0.860257465110808

 precision-0          score for tfidf : 0.5784925099903739

 recall-0             score for tfidf : 0.8463887241456002

 f1-0                 score for tfidf : 0.6870476202382443

 precision-1          score for tfidf : 0.9621600784762144

 recall-1             score for tfidf : 0.8633260510402712

 f1-1                 score for tfidf : 0.9100295215782153

In [139]:
viz = PrecisionRecallCurve(
                            LogisticRegression(C=0.1, max_iter=1000, random_state=101, class_weight="balanced"),
                            classes=log.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_tf_idf,y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
In [140]:
log_AP_tfidf = viz.score_
In [141]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB # BernoulliNB for binary model
In [142]:
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
Out[142]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
In [143]:
print("NB MODEL")

eval(nb, X_train_count, X_test_count)
NB MODEL
Test_Set
              precision    recall  f1-score   support

           0       0.67      0.74      0.70       820
           1       0.94      0.92      0.93      3709

    accuracy                           0.89      4529
   macro avg       0.80      0.83      0.82      4529
weighted avg       0.89      0.89      0.89      4529

Train_Set
              precision    recall  f1-score   support

           0       0.70      0.81      0.75      3281
           1       0.96      0.92      0.94     14831

    accuracy                           0.90     18112
   macro avg       0.83      0.87      0.85     18112
weighted avg       0.91      0.90      0.91     18112

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[143], line 3
      1 print("NB MODEL")
----> 3 eval(nb, X_train_count, X_test_count)

Cell In[130], line 11, in eval(model, X_train, X_test)
      8 print(classification_report(y_train, y_pred_train))
      9 fig, ax = plt.subplots(figsize=(8, 8))
---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax)

NameError: name 'plot_confusion_matrix' is not defined
In [144]:
custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    model = MultinomialNB()
    scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        nb_count_rec = scores
    elif i == "f1-1":
        nb_count_f1 = scores
    print(f" {i:20} score for count : {scores}\n")
 accuracy             score for count : 0.8826189413968841

 precision-0          score for count : 0.654342059232186

 recall-0             score for count : 0.7476425235376973

 f1-0                 score for count : 0.6975979898377522

 precision-1          score for count : 0.9423871525871338

 recall-1             score for count : 0.9124805295596273

 f1-1                 score for count : 0.927165611564892

In [145]:
viz = PrecisionRecallCurve(
                            MultinomialNB(),
                            classes=nb.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_count,y_train)
viz.score(X_test_count, y_test)
viz.show();
In [146]:
nb_AP_count = viz.score_
In [147]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
nb = MultinomialNB()
nb.fit(X_train_tf_idf, y_train)
Out[147]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
In [148]:
print("NB MODEL")
eval(nb, X_train_tf_idf, X_test_tf_idf)
NB MODEL
Test_Set
              precision    recall  f1-score   support

           0       0.87      0.17      0.28       820
           1       0.84      0.99      0.91      3709

    accuracy                           0.84      4529
   macro avg       0.85      0.58      0.60      4529
weighted avg       0.85      0.84      0.80      4529

Train_Set
              precision    recall  f1-score   support

           0       0.95      0.22      0.36      3281
           1       0.85      1.00      0.92     14831

    accuracy                           0.86     18112
   macro avg       0.90      0.61      0.64     18112
weighted avg       0.87      0.86      0.82     18112

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[148], line 2
      1 print("NB MODEL")
----> 2 eval(nb, X_train_tf_idf, X_test_tf_idf)

Cell In[130], line 11, in eval(model, X_train, X_test)
      8 print(classification_report(y_train, y_pred_train))
      9 fig, ax = plt.subplots(figsize=(8, 8))
---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax)

NameError: name 'plot_confusion_matrix' is not defined
In [149]:
custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    model = BernoulliNB()
    scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        nb_tfidf_rec = scores
    elif i == "f1-1":
        nb_tfidf_f1 = scores
    print(f" {i:20} score for tfidf : {scores}\n")
 accuracy             score for tfidf : 0.8766563300312171

 precision-0          score for tfidf : 0.6434549409966672

 recall-0             score for tfidf : 0.7168618874638594

 f1-0                 score for tfidf : 0.6778544585841554

 precision-1          score for tfidf : 0.9357830256315294

 recall-1             score for tfidf : 0.9120087405692183

 f1-1                 score for tfidf : 0.9237148436630017

In [150]:
from yellowbrick.classifier import PrecisionRecallCurve

viz = PrecisionRecallCurve(
                            MultinomialNB(),
                            classes=nb.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_tf_idf, y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
In [151]:
nb_AP_tfidf = viz.score_
In [152]:
from sklearn.svm import LinearSVC

svc = LinearSVC(C=0.01, class_weight="balanced", random_state=101)
svc.fit(X_train_count,y_train)
Out[152]:
LinearSVC(C=0.01, class_weight='balanced', random_state=101)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearSVC(C=0.01, class_weight='balanced', random_state=101)
In [153]:
print("SVC MODEL")

eval(svc, X_train_count, X_test_count)
SVC MODEL
Test_Set
              precision    recall  f1-score   support

           0       0.60      0.84      0.70       820
           1       0.96      0.87      0.92      3709

    accuracy                           0.87      4529
   macro avg       0.78      0.86      0.81      4529
weighted avg       0.90      0.87      0.88      4529

Train_Set
              precision    recall  f1-score   support

           0       0.65      0.92      0.76      3281
           1       0.98      0.89      0.93     14831

    accuracy                           0.90     18112
   macro avg       0.82      0.91      0.85     18112
weighted avg       0.92      0.90      0.90     18112

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[153], line 3
      1 print("SVC MODEL")
----> 3 eval(svc, X_train_count, X_test_count)

Cell In[130], line 11, in eval(model, X_train, X_test)
      8 print(classification_report(y_train, y_pred_train))
      9 fig, ax = plt.subplots(figsize=(8, 8))
---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax)

NameError: name 'plot_confusion_matrix' is not defined
In [154]:
custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    model = LinearSVC(C=0.01, class_weight="balanced", random_state=101)
    scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        svc_count_rec = scores
    elif i == "f1-1":
        svc_count_f1 = scores
    print(f" {i:20} score for count : {scores}\n")
 accuracy             score for count : 0.8649509741181862

 precision-0          score for count : 0.590799663472119

 recall-0             score for count : 0.8305396990140114

 f1-0                 score for count : 0.6902787096697695

 precision-1          score for count : 0.9588319718226306

 recall-1             score for count : 0.872563900304075

 f1-1                 score for count : 0.91363862996587

In [155]:
viz = PrecisionRecallCurve(
                            LinearSVC(C=0.01, class_weight="balanced", random_state=101),
                            classes=svc.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_count,y_train)
viz.score(X_test_count, y_test)
viz.show();
In [156]:
svc_AP_count = viz.score_
In [157]:
svc = LinearSVC(C=0.01, class_weight="balanced", random_state=101)

svc.fit(X_train_tf_idf, y_train)
Out[157]:
LinearSVC(C=0.01, class_weight='balanced', random_state=101)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearSVC(C=0.01, class_weight='balanced', random_state=101)
In [158]:
print("SVC MODEL")

eval(svc, X_train_tf_idf, X_test_tf_idf)
SVC MODEL
Test_Set
              precision    recall  f1-score   support

           0       0.55      0.87      0.67       820
           1       0.97      0.84      0.90      3709

    accuracy                           0.85      4529
   macro avg       0.76      0.86      0.79      4529
weighted avg       0.89      0.85      0.86      4529

Train_Set
              precision    recall  f1-score   support

           0       0.57      0.89      0.69      3281
           1       0.97      0.85      0.91     14831

    accuracy                           0.86     18112
   macro avg       0.77      0.87      0.80     18112
weighted avg       0.90      0.86      0.87     18112

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[158], line 3
      1 print("SVC MODEL")
----> 3 eval(svc, X_train_tf_idf, X_test_tf_idf)

Cell In[130], line 11, in eval(model, X_train, X_test)
      8 print(classification_report(y_train, y_pred_train))
      9 fig, ax = plt.subplots(figsize=(8, 8))
---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax)

NameError: name 'plot_confusion_matrix' is not defined
In [159]:
custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    model = LinearSVC(C=0.01, class_weight="balanced", random_state=101)
    scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        svc_tfidf_rec = scores
    elif i == "f1-1":
        svc_tfidf_f1 = scores
    print(f" {i:20} score for tfidf : {scores}\n")
 accuracy             score for tfidf : 0.8430870703074053

 precision-0          score for tfidf : 0.5428195619628373

 recall-0             score for tfidf : 0.8576673585884796

 f1-0                 score for tfidf : 0.6646097767647368

 precision-1          score for tfidf : 0.9639062389709894

 recall-1             score for tfidf : 0.8398624664435934

 f1-1                 score for tfidf : 0.8975617321619218

In [160]:
viz = PrecisionRecallCurve(
                            LinearSVC(C=0.01, class_weight="balanced", random_state=101),
                            classes=svc.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_tf_idf,y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
In [161]:
svc_AP_tfidf = viz.score_
In [162]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(200, max_depth = 12, random_state = 42, n_jobs = -1, class_weight="balanced")
rf.fit(X_train_count, y_train)
Out[162]:
RandomForestClassifier(class_weight='balanced', max_depth=12, n_estimators=200,
                       n_jobs=-1, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(class_weight='balanced', max_depth=12, n_estimators=200,
                       n_jobs=-1, random_state=42)
In [163]:
print("RF MODEL")

eval(rf, X_train_count, X_test_count)
RF MODEL
Test_Set
              precision    recall  f1-score   support

           0       0.57      0.78      0.66       820
           1       0.95      0.87      0.91      3709

    accuracy                           0.85      4529
   macro avg       0.76      0.83      0.78      4529
weighted avg       0.88      0.85      0.86      4529

Train_Set
              precision    recall  f1-score   support

           0       0.63      0.90      0.74      3281
           1       0.97      0.88      0.93     14831

    accuracy                           0.89     18112
   macro avg       0.80      0.89      0.83     18112
weighted avg       0.91      0.89      0.89     18112

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[163], line 3
      1 print("RF MODEL")
----> 3 eval(rf, X_train_count, X_test_count)

Cell In[130], line 11, in eval(model, X_train, X_test)
      8 print(classification_report(y_train, y_pred_train))
      9 fig, ax = plt.subplots(figsize=(8, 8))
---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax)

NameError: name 'plot_confusion_matrix' is not defined
In [164]:
custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    model = RandomForestClassifier(200, max_depth = 12, random_state = 42, n_jobs = -1, class_weight="balanced")
    scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        rf_count_rec = scores
    elif i == "f1-1":
        rf_count_f1 = scores
    print(f" {i:20} score for count : {scores}\n")
 accuracy             score for count : 0.8500434248393738

 precision-0          score for count : 0.5623773261675364

 recall-0             score for count : 0.7866502335236119

 f1-0                 score for count : 0.6554505796690441

 precision-1          score for count : 0.9482590583977227

 recall-1             score for count : 0.8640680633886655

 f1-1                 score for count : 0.9041336198854516

In [165]:
viz = PrecisionRecallCurve(
                            RandomForestClassifier(200, max_depth = 10, random_state = 42, n_jobs = -1, class_weight="balanced"),
                            classes=rf.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_tf_idf,y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
In [166]:
rf_AP_tfidf = viz.score_
In [167]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators= 500, random_state = 42)
ada.fit(X_train_count, y_train)
Out[167]:
AdaBoostClassifier(n_estimators=500, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
AdaBoostClassifier(n_estimators=500, random_state=42)
In [168]:
print("Ada MODEL")

eval(ada, X_train_count, X_test_count)
Ada MODEL
Test_Set
              precision    recall  f1-score   support

           0       0.73      0.59      0.65       820
           1       0.91      0.95      0.93      3709

    accuracy                           0.89      4529
   macro avg       0.82      0.77      0.79      4529
weighted avg       0.88      0.89      0.88      4529

Train_Set
              precision    recall  f1-score   support

           0       0.80      0.66      0.72      3281
           1       0.93      0.96      0.94     14831

    accuracy                           0.91     18112
   macro avg       0.86      0.81      0.83     18112
weighted avg       0.90      0.91      0.90     18112

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[168], line 3
      1 print("Ada MODEL")
----> 3 eval(ada, X_train_count, X_test_count)

Cell In[130], line 11, in eval(model, X_train, X_test)
      8 print(classification_report(y_train, y_pred_train))
      9 fig, ax = plt.subplots(figsize=(8, 8))
---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax)

NameError: name 'plot_confusion_matrix' is not defined
In [14]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer



custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    model = AdaBoostClassifier(n_estimators= 500, random_state = 42)
    scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        ada_count_rec = scores
    elif i == "f1-1":
        ada_count_f1 = scores
    print(f" {i:20} score for count : {scores}\n")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 23
     21 for i, j in custom_scorer.items():
     22     model = AdaBoostClassifier(n_estimators= 500, random_state = 42)
---> 23     scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
     24     if i == "recall-1":
     25         ada_count_rec = scores

NameError: name 'X_train_count' is not defined
In [15]:
from sklearn.metrics import PrecisionRecallCurve
viz = PrecisionRecallCurve(
                            AdaBoostClassifier(n_estimators= 500, random_state = 42),
                            classes=ada.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_count,y_train)
viz.score(X_test_count, y_test)
viz.show();
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[15], line 1
----> 1 from sklearn.metrics import PrecisionRecallCurve
      2 viz = PrecisionRecallCurve(
      3                             AdaBoostClassifier(n_estimators= 500, random_state = 42),
      4                             classes=ada.classes_,
      5                             per_class=True,
      6                             cmap="Set1"
      7                            )
      9 fig, ax = plt.subplots(figsize=(10, 6))

ImportError: cannot import name 'PrecisionRecallCurve' from 'sklearn.metrics' (C:\ProgramData\anaconda3\Lib\site-packages\sklearn\metrics\__init__.py)
In [16]:
ada_AP_count = viz.score_
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 1
----> 1 ada_AP_count = viz.score_

NameError: name 'viz' is not defined
In [17]:
ada = AdaBoostClassifier(n_estimators= 500, random_state = 42)

ada.fit(X_train_tf_idf, y_train)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 3
      1 ada = AdaBoostClassifier(n_estimators= 500, random_state = 42)
----> 3 ada.fit(X_train_tf_idf, y_train)

NameError: name 'X_train_tf_idf' is not defined
In [18]:
print("Ada MODEL")

eval(ada, X_train_tf_idf, X_test_tf_idf)
Ada MODEL
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 3
      1 print("Ada MODEL")
----> 3 eval(ada, X_train_tf_idf, X_test_tf_idf)

NameError: name 'X_train_tf_idf' is not defined
In [19]:
custom_scorer = {'accuracy': make_scorer(accuracy_score),
                 'precision-0': make_scorer(precision_score, pos_label=0),
                 'recall-0': make_scorer(recall_score, pos_label=0),
                 'f1-0': make_scorer(f1_score, pos_label=0),
                 'precision-1': make_scorer(precision_score, pos_label=1),
                 'recall-1': make_scorer(recall_score, pos_label=1),
                 'f1-1': make_scorer(f1_score, pos_label=1)
                 }

for i, j in custom_scorer.items():
    model =AdaBoostClassifier(n_estimators= 500, random_state = 42)
    scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
    if i == "recall-1":
        ada_tfidf_rec = scores
    elif i == "f1-1":
        ada_tfidf_f1 = scores
    print(f" {i:20} score for tfidf : {scores}\n")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 12
     10 for i, j in custom_scorer.items():
     11     model =AdaBoostClassifier(n_estimators= 500, random_state = 42)
---> 12     scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
     13     if i == "recall-1":
     14         ada_tfidf_rec = scores

NameError: name 'X_train_tf_idf' is not defined
In [20]:
viz = PrecisionRecallCurve(
                            AdaBoostClassifier(n_estimators= 500, random_state = 42),
                            classes=ada.classes_,
                            per_class=True,
                            cmap="Set1"
                           )

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')

viz.fit(X_train_tf_idf,y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 1
----> 1 viz = PrecisionRecallCurve(
      2                             AdaBoostClassifier(n_estimators= 500, random_state = 42),
      3                             classes=ada.classes_,
      4                             per_class=True,
      5                             cmap="Set1"
      6                            )
      8 fig, ax = plt.subplots(figsize=(10, 6))
      9 ax.set_facecolor('yellow')

NameError: name 'PrecisionRecallCurve' is not defined
In [21]:
ada_AP_tfidf = viz.score_
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 1
----> 1 ada_AP_tfidf = viz.score_

NameError: name 'viz' is not defined
In [22]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[22], line 3
      1 import numpy as np
      2 import pandas as pd
----> 3 from tensorflow.keras.models import Sequential
      4 from tensorflow.keras.layers import Dense, GRU, Embedding
      5 from tensorflow.keras.optimizers import Adam

ModuleNotFoundError: No module named 'tensorflow'
In [25]:
os.chdir('C:\\Users\\aksha\\OneDrive\\Desktop\\Placement\\Self Project\\NLP')
df0=pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df = df0.copy()
df.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 1
----> 1 os.chdir('C:\\Users\\aksha\\OneDrive\\Desktop\\Placement\\Self Project\\NLP')
      2 df0=pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
      3 df = df0.copy()

NameError: name 'os' is not defined
In [26]:
df_dl = df_dl[["Review Text","Recommended IND"]]
df_dl.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 1
----> 1 df_dl = df_dl[["Review Text","Recommended IND"]]
      2 df_dl.head()

NameError: name 'df_dl' is not defined
In [27]:
df_dl.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 1
----> 1 df_dl.shape

NameError: name 'df_dl' is not defined
In [28]:
df_dl.dropna(inplace = True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 1
----> 1 df_dl.dropna(inplace = True)

NameError: name 'df_dl' is not defined
In [29]:
df_dl.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 df_dl.shape

NameError: name 'df_dl' is not defined
In [30]:
X = df_dl['Review Text'].values
y = df_dl['Recommended IND'].values
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[30], line 1
----> 1 X = df_dl['Review Text'].values
      2 y = df_dl['Recommended IND'].values

NameError: name 'df_dl' is not defined
In [31]:
tokenizer.fit_on_texts(X)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[31], line 1
----> 1 tokenizer.fit_on_texts(X)

NameError: name 'tokenizer' is not defined
In [32]:
tokenizer.word_index
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[32], line 1
----> 1 tokenizer.word_index

NameError: name 'tokenizer' is not defined
In [33]:
len(tokenizer.word_index) 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[33], line 1
----> 1 len(tokenizer.word_index)

NameError: name 'tokenizer' is not defined
In [34]:
X_num_tokens = tokenizer.texts_to_sequences(X)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[34], line 1
----> 1 X_num_tokens = tokenizer.texts_to_sequences(X)

NameError: name 'tokenizer' is not defined
In [35]:
num_tokens = [len(tokens) for tokens in X_num_tokens]
num_tokens = np.array(num_tokens)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[35], line 1
----> 1 num_tokens = [len(tokens) for tokens in X_num_tokens]
      2 num_tokens = np.array(num_tokens)

NameError: name 'X_num_tokens' is not defined
In [36]:
np.array(X_num_tokens)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[36], line 1
----> 1 np.array(X_num_tokens)

NameError: name 'X_num_tokens' is not defined
In [37]:
X[105]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[37], line 1
----> 1 X[105]

NameError: name 'X' is not defined
In [38]:
print(X_num_tokens[105])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[38], line 1
----> 1 print(X_num_tokens[105])

NameError: name 'X_num_tokens' is not defined
In [39]:
tokenizer.word_index["shirt"]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[39], line 1
----> 1 tokenizer.word_index["shirt"]

NameError: name 'tokenizer' is not defined
In [40]:
tokenizer.word_index["exactly"]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[40], line 1
----> 1 tokenizer.word_index["exactly"]

NameError: name 'tokenizer' is not defined
In [41]:
num_tokens.mean()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[41], line 1
----> 1 num_tokens.mean()

NameError: name 'num_tokens' is not defined
In [42]:
num_tokens.max() 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[42], line 1
----> 1 num_tokens.max()

NameError: name 'num_tokens' is not defined
In [43]:
num_tokens.argmax()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[43], line 1
----> 1 num_tokens.argmax()

NameError: name 'num_tokens' is not defined
In [44]:
X[16263]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[44], line 1
----> 1 X[16263]

NameError: name 'X' is not defined
In [45]:
len(X[16263])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[45], line 1
----> 1 len(X[16263])

NameError: name 'X' is not defined
In [46]:
num_tokens.argmin()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[46], line 1
----> 1 num_tokens.argmin()

NameError: name 'num_tokens' is not defined
In [47]:
X[820]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[47], line 1
----> 1 X[820]

NameError: name 'X' is not defined
In [48]:
len(X[820])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[48], line 1
----> 1 len(X[820])

NameError: name 'X' is not defined
In [49]:
len(X_num_tokens[105])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[49], line 1
----> 1 len(X_num_tokens[105])

NameError: name 'X_num_tokens' is not defined
In [50]:
np.array(X_num_tokens[105])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[50], line 1
----> 1 np.array(X_num_tokens[105])

NameError: name 'X_num_tokens' is not defined
In [51]:
len(X_num_tokens[106])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[51], line 1
----> 1 len(X_num_tokens[106])

NameError: name 'X_num_tokens' is not defined
In [52]:
np.array(X_num_tokens[106])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[52], line 1
----> 1 np.array(X_num_tokens[106])

NameError: name 'X_num_tokens' is not defined
In [53]:
num_tokens = [len(tokens) for tokens in X_num_tokens]

num_tokens = np.array(num_tokens)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[53], line 1
----> 1 num_tokens = [len(tokens) for tokens in X_num_tokens]
      3 num_tokens = np.array(num_tokens)

NameError: name 'X_num_tokens' is not defined
In [54]:
num_tokens
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[54], line 1
----> 1 num_tokens

NameError: name 'num_tokens' is not defined
In [55]:
max_tokens = 103
In [56]:
sum(num_tokens < max_tokens) / len(num_tokens)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[56], line 1
----> 1 sum(num_tokens < max_tokens) / len(num_tokens)

NameError: name 'num_tokens' is not defined
In [57]:
sum(num_tokens < max_tokens) # the number of documents which have 103 or less tokens
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[57], line 1
----> 1 sum(num_tokens < max_tokens)

NameError: name 'num_tokens' is not defined
In [58]:
len(num_tokens)  # total number of all documents in corpus which is constrained by num_words as 20000
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[58], line 1
----> 1 len(num_tokens)

NameError: name 'num_tokens' is not defined
In [59]:
X_pad = pad_sequences(X_num_tokens, maxlen=max_tokens)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[59], line 1
----> 1 X_pad = pad_sequences(X_num_tokens, maxlen=max_tokens)

NameError: name 'pad_sequences' is not defined
In [60]:
X_pad.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[60], line 1
----> 1 X_pad.shape

NameError: name 'X_pad' is not defined
In [61]:
X_pad[105]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[61], line 1
----> 1 X_pad[105]

NameError: name 'X_pad' is not defined
In [62]:
X_pad[106]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[62], line 1
----> 1 X_pad[106]

NameError: name 'X_pad' is not defined
In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, stratify=y, random_state=101)  

# we have been using stratify to prevent imbalance.
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[63], line 2
      1 from sklearn.model_selection import train_test_split
----> 2 X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, stratify=y, random_state=101)

NameError: name 'X_pad' is not defined
In [64]:
model = Sequential()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[64], line 1
----> 1 model = Sequential()

NameError: name 'Sequential' is not defined
In [65]:
embedding_size = 100
In [66]:
model.add(Embedding(input_dim=num_words,        
                    output_dim=embedding_size,                                       
                    input_length=max_tokens,    
                    name='embedding_layer')) 
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[66], line 1
----> 1 model.add(Embedding(input_dim=num_words,        
      2                     output_dim=embedding_size,                                       
      3                     input_length=max_tokens,    
      4                     name='embedding_layer'))

AttributeError: 'AdaBoostClassifier' object has no attribute 'add'
In [67]:
model.add(GRU(units=48, return_sequences=True))  
model.add(GRU(units=24, return_sequences=True)) 
model.add(GRU(units=12)) 
model.add(Dense(1, activation='sigmoid'))  
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[67], line 1
----> 1 model.add(GRU(units=48, return_sequences=True))  
      2 model.add(GRU(units=24, return_sequences=True)) 
      3 model.add(GRU(units=12)) 

AttributeError: 'AdaBoostClassifier' object has no attribute 'add'
In [68]:
optimizer = Adam(learning_rate=0.006)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[68], line 1
----> 1 optimizer = Adam(learning_rate=0.006)

NameError: name 'Adam' is not defined
In [69]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['Recall'])
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[69], line 1
----> 1 model.compile(loss='binary_crossentropy',
      2               optimizer=optimizer,
      3               metrics=['Recall'])

AttributeError: 'AdaBoostClassifier' object has no attribute 'compile'
In [70]:
model.summary() 
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[70], line 1
----> 1 model.summary()

AttributeError: 'AdaBoostClassifier' object has no attribute 'summary'
In [71]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor="val_loss", mode="auto", 
                           verbose=1, patience = 10, restore_best_weights=True)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[71], line 1
----> 1 from tensorflow.keras.callbacks import EarlyStopping
      3 early_stop = EarlyStopping(monitor="val_loss", mode="auto", 
      4                            verbose=1, patience = 10, restore_best_weights=True)

ModuleNotFoundError: No module named 'tensorflow'
In [72]:
pd.Series(y_train).value_counts(normalize=True) 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[72], line 1
----> 1 pd.Series(y_train).value_counts(normalize=True)

NameError: name 'y_train' is not defined
In [73]:
weights = {0:82, 1:18}
In [74]:
model.fit(X_train, y_train, epochs=30, batch_size=256, class_weight=weights,
         validation_data=(X_test, y_test), callbacks=[early_stop])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[74], line 1
----> 1 model.fit(X_train, y_train, epochs=30, batch_size=256, class_weight=weights,
      2          validation_data=(X_test, y_test), callbacks=[early_stop])

NameError: name 'X_train' is not defined
In [75]:
model.save('NLP_Sentiment_Analysis_Project')
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[75], line 1
----> 1 model.save('NLP_Sentiment_Analysis_Project')

AttributeError: 'AdaBoostClassifier' object has no attribute 'save'
In [76]:
model_loss = pd.DataFrame(model.history.history)
model_loss.head()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[76], line 1
----> 1 model_loss = pd.DataFrame(model.history.history)
      2 model_loss.head()

AttributeError: 'AdaBoostClassifier' object has no attribute 'history'
In [77]:
model_loss.plot();
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[77], line 1
----> 1 model_loss.plot()

NameError: name 'model_loss' is not defined
In [78]:
model.evaluate(X_train, y_train)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[78], line 1
----> 1 model.evaluate(X_train, y_train)

AttributeError: 'AdaBoostClassifier' object has no attribute 'evaluate'
In [79]:
model.evaluate(X_test, y_test)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[79], line 1
----> 1 model.evaluate(X_test, y_test)

AttributeError: 'AdaBoostClassifier' object has no attribute 'evaluate'
In [80]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score

y_train_pred = (model.predict(X_train) >= 0.5).astype("int32")  

print(confusion_matrix(y_train, y_train_pred))
print("-------------------------------------------------------")
print(classification_report(y_train, y_train_pred))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[80], line 3
      1 from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score
----> 3 y_train_pred = (model.predict(X_train) >= 0.5).astype("int32")  
      5 print(confusion_matrix(y_train, y_train_pred))
      6 print("-------------------------------------------------------")

NameError: name 'X_train' is not defined
In [81]:
y_pred = (model.predict(X_test) >= 0.5).astype("int32")

print(confusion_matrix(y_test, y_pred))
print("-------------------------------------------------------")
print(classification_report(y_test, y_pred))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[81], line 1
----> 1 y_pred = (model.predict(X_test) >= 0.5).astype("int32")
      3 print(confusion_matrix(y_test, y_pred))
      4 print("-------------------------------------------------------")

NameError: name 'X_test' is not defined
In [82]:
from sklearn.metrics import precision_recall_curve, average_precision_score

y_pred_proba = model.predict(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# plt.plot([1, 0], [0, 1],'k--')
plt.plot(precision, recall)
plt.xlabel('precision')
plt.ylabel('recall')
plt.title('Precision Recall Curve')
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[82], line 3
      1 from sklearn.metrics import precision_recall_curve, average_precision_score
----> 3 y_pred_proba = model.predict(X_test)
      4 precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
      6 # plt.plot([1, 0], [0, 1],'k--')

NameError: name 'X_test' is not defined
In [83]:
from sklearn.metrics import precision_recall_curve, average_precision_score, recall_score

DL_AP = average_precision_score(y_test, y_pred_proba)
DL_f1 = f1_score(y_test, y_pred)
DL_rec = recall_score(y_test, y_pred)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[83], line 3
      1 from sklearn.metrics import precision_recall_curve, average_precision_score, recall_score
----> 3 DL_AP = average_precision_score(y_test, y_pred_proba)
      4 DL_f1 = f1_score(y_test, y_pred)
      5 DL_rec = recall_score(y_test, y_pred)

NameError: name 'y_test' is not defined
In [84]:
review1 = "Love this dress"
review2 = "Absolutely wonderful. silky and sexy and comfortable"
review3 = "i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up!"
review4 = "I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!"
review5 = 'This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!!'
review6 = 'I love tracy reese dresses, but this one is not for the very petite. i am just under 5 feet tall and usually wear a 0p in this brand. this dress was very pretty out of the package but its a lot of dress.'
review7 = 'I love this dress. i usually get an xs but it runs a little snug in bust so i ordered up a size. very flattering and feminine with the usual retailer flair for style.'
review8 = 'Dress runs small esp where the zipper area runs. i ordered the sp which typically fits me and it was very tight! the material on the top looks and feels very cheap that even just pulling on it will cause it to rip the fabric. pretty disappointed as it was going to be my christmas dress this year! needless to say it will be going back.'
review9 =  "if you are at least average height or taller, this may look good on you."
review10 = "sadly will be returning, but i'm sure i will find something to exchange it for!"
review11 = "Cute little dress fits tts. it is a little high waisted. good length for my 5'9 height. i like the dress, i'm just not in love with it. i dont think it looks or feels cheap. it appears just as pictured."
review12 = 'Loved the material, but i didnt really look at how long the dress was before i purchased both a large and a medium. im 5\'5" and there was atleast 5" of material at my feet. the gaps in the front are much wider than they look. felt like the dress just fell flat. both were returned. im usually a large and the med fit better. 36d 30 in jeans'
review13 = "I have been waiting for this sweater coat to ship for weeks and i was so excited for it to arrive. this coat is not true to size and made me look short and squat."
review14 = 'Very comfortable, material is good, cut out on sleeves flattering'
reviews = [review1, review2, review3, review4, review5, review6, review7, review8, review9, review10, review11, review12, review13, review14]
In [85]:
tokens = tokenizer.texts_to_sequences(reviews) 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[85], line 1
----> 1 tokens = tokenizer.texts_to_sequences(reviews)

NameError: name 'tokenizer' is not defined
In [86]:
tokens_pad = pad_sequences(tokens, maxlen=max_tokens)
tokens_pad.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[86], line 1
----> 1 tokens_pad = pad_sequences(tokens, maxlen=max_tokens)
      2 tokens_pad.shape

NameError: name 'pad_sequences' is not defined
In [87]:
mod_pred = model.predict(tokens_pad)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[87], line 1
----> 1 mod_pred = model.predict(tokens_pad)

NameError: name 'tokens_pad' is not defined
In [88]:
mod_pred
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[88], line 1
----> 1 mod_pred

NameError: name 'mod_pred' is not defined
In [89]:
df_pred = pd.DataFrame(mod_pred, index=reviews)
df_pred.rename(columns={0: 'Pred_Proba'}, inplace=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[89], line 1
----> 1 df_pred = pd.DataFrame(mod_pred, index=reviews)
      2 df_pred.rename(columns={0: 'Pred_Proba'}, inplace=True)

NameError: name 'mod_pred' is not defined
In [90]:
df_pred["Predicted_Feedbaack"] = df_pred["Pred_Proba"].apply(lambda x: "Recommended" if x>=0.5 else "Not Recommended")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[90], line 1
----> 1 df_pred["Predicted_Feedbaack"] = df_pred["Pred_Proba"].apply(lambda x: "Recommended" if x>=0.5 else "Not Recommended")

NameError: name 'df_pred' is not defined
In [91]:
df_pred
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[91], line 1
----> 1 df_pred

NameError: name 'df_pred' is not defined
In [92]:
compare = pd.DataFrame({"Model": ["NaiveBayes_count", "LogReg_count", "SVM_count", "Random Forest_count", 
                                  "AdaBoost_count", "NaiveBayes_tfidf", "LogReg_tfidf", "SVM_tfidf", 
                                  "Random Forest_tfidf", "AdaBoost_tfidf", "DL"],
                        
                        "F1_Score": [nb_count_f1, log_count_f1, svc_count_f1,
                                             rf_count_f1, ada_count_f1, nb_tfidf_f1, log_tfidf_f1,
                                             svc_tfidf_f1, rf_tfidf_f1, ada_tfidf_f1, DL_f1],
                        
                        "Recall_Score": [nb_count_rec, log_count_rec, svc_count_rec, 
                                                   rf_count_rec, ada_count_rec, 
                                                  nb_tfidf_rec, log_tfidf_rec, svc_tfidf_rec, 
                                                  rf_tfidf_rec, ada_tfidf_rec, DL_rec],
                        
                        "Average_Precision_Score": [nb_AP_count, log_AP_count, svc_AP_count, rf_AP_count,
                                                   ada_AP_count, nb_AP_tfidf, log_AP_tfidf, svc_AP_tfidf,
                                                   rf_AP_tfidf, ada_AP_tfidf, DL_AP]})

def labels(ax):
                        
    for p in ax.patches:
        width = p.get_width()                        # get bar length
        ax.text(width,                               # set the text at 1 unit right of the bar
                p.get_y() + p.get_height() / 2,      # get Y coordinate + X coordinate / 2
                '{:1.3f}'.format(width),             # set variable to display, 2 decimals
                ha = 'left',                         # horizontal alignment
                va = 'center')                       # vertical alignment
    
plt.figure(figsize=(15,30))
plt.subplot(311)
compare = compare.sort_values(by="Recall_Score", ascending=False)
ax=sns.barplot(x="Recall_Score", y="Model", data=compare, palette="Blues_d")
labels(ax)

plt.subplot(312)
compare = compare.sort_values(by="F1_Score", ascending=False)
ax=sns.barplot(x="F1_Score", y="Model", data=compare, palette="Blues_d")
labels(ax)


plt.subplot(313)
compare = compare.sort_values(by="Average_Precision_Score", ascending=False)
ax=sns.barplot(x="Average_Precision_Score", y="Model", data=compare, palette="Blues_d")
labels(ax)
plt.show();
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[92], line 5
      1 compare = pd.DataFrame({"Model": ["NaiveBayes_count", "LogReg_count", "SVM_count", "Random Forest_count", 
      2                                   "AdaBoost_count", "NaiveBayes_tfidf", "LogReg_tfidf", "SVM_tfidf", 
      3                                   "Random Forest_tfidf", "AdaBoost_tfidf", "DL"],
      4                         
----> 5                         "F1_Score": [nb_count_f1, log_count_f1, svc_count_f1,
      6                                              rf_count_f1, ada_count_f1, nb_tfidf_f1, log_tfidf_f1,
      7                                              svc_tfidf_f1, rf_tfidf_f1, ada_tfidf_f1, DL_f1],
      8                         
      9                         "Recall_Score": [nb_count_rec, log_count_rec, svc_count_rec, 
     10                                                    rf_count_rec, ada_count_rec, 
     11                                                   nb_tfidf_rec, log_tfidf_rec, svc_tfidf_rec, 
     12                                                   rf_tfidf_rec, ada_tfidf_rec, DL_rec],
     13                         
     14                         "Average_Precision_Score": [nb_AP_count, log_AP_count, svc_AP_count, rf_AP_count,
     15                                                    ada_AP_count, nb_AP_tfidf, log_AP_tfidf, svc_AP_tfidf,
     16                                                    rf_AP_tfidf, ada_AP_tfidf, DL_AP]})
     18 def labels(ax):
     20     for p in ax.patches:

NameError: name 'nb_count_f1' is not defined
In [ ]: